Commit eba6d200 authored by wxchan's avatar wxchan
Browse files

Squash into one commit:

1. merge python-package
2. add dump model to json
3. fix bugs
4. clean code with pylint
5. update python examples
parent 19e085c9
......@@ -21,9 +21,13 @@ script:
- cd $TRAVIS_BUILD_DIR
- mkdir build && cd build && cmake .. && make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
notifications:
email: false
......
LightGBM, Light Gradient Boosting Machine
==========
=========================================
[![Build Status](https://travis-ci.org/Microsoft/LightGBM.svg?branch=master)](https://travis-ci.org/Microsoft/LightGBM)
LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:
......@@ -14,6 +14,11 @@ For more details, please refer to [Features](https://github.com/Microsoft/LightG
[Experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments#comparison-experiment) on public datasets show that LightGBM can outperform other existing boosting framework on both efficiency and accuracy, with significant lower memory consumption. What's more, the [experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments#parallel-experiment) show that LightGBM can achieve a linear speed-up by using multiple machines for training in specific settings.
News
----
12/02/2016 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version; you are welcome to try it out and provide issues and feedback.
Get Started
------------
To get started, please follow the [Installation Guide](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide) and [Quick Start](https://github.com/Microsoft/LightGBM/wiki/Quick-Start).
......
# coding: utf-8
# pylint: disable = invalid-name, C0111
"""Regression example using the low-level lgb.train() API.

Trains a GBDT regressor, saves/reloads the model, evaluates RMSE on the
test split, and dumps the model structure to a JSON file.
"""
import json

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

# Load the demo dataset: tab-separated, no header, label in column 0.
train_df = pd.read_csv('../regression/regression.train', header=None, sep='\t')
test_df = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = train_df[0]
y_test = test_df[0]
X_train = train_df.drop(0, axis=1)
X_test = test_df.drop(0, axis=1)

# Wrap the data in LightGBM Dataset objects.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# A plain (data, label) tuple of length 2 works as well:
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)

# Training configuration, passed straight through to the core library.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # 'ndcg_eval_at': [1, 3, 5, 10],
    # this metric is not needed for this task; shown only as an example
    'verbose': 0
}

# Train with early stopping on the held-out data.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_datas=lgb_eval,
                # a list may be used for multiple valid_datas/valid_names;
                # a tuple always denotes a single dataset
                early_stopping_rounds=10)

# Round-trip the model through a text file.
gbm.save_model('model.txt')
gbm = lgb.Booster(model_file='model.txt')

# Predict with the best iteration found by early stopping and report RMSE.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rmse)

# Serialize the model structure as JSON.
model_json = gbm.dump_model()
with open('model.json', 'w+') as model_file:
    json.dump(model_json, model_file, indent=4)
# coding: utf-8
# pylint: disable = invalid-name, C0111
"""Regression example using the scikit-learn style LGBMRegressor wrapper."""
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

# Load the demo dataset: tab-separated, no header, label in column 0.
train_df = pd.read_csv('../regression/regression.train', header=None, sep='\t')
test_df = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = train_df[0]
y_test = test_df[0]
X_train = train_df.drop(0, axis=1)
X_test = test_df.drop(0, axis=1)

# Fit the sklearn-style estimator, early-stopping on the test split.
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=100)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=10)

# Predict with the best iteration and report RMSE.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rmse)
......@@ -51,6 +51,18 @@ public:
explicit BinMapper(const void* memory);
~BinMapper();
bool CheckAlign(const BinMapper& other) const {
  // Two mappers are aligned only when they discretize a feature identically:
  // same number of bins and an identical upper bound for every bin.
  if (num_bin_ != other.num_bin_) {
    return false;
  }
  int idx = 0;
  while (idx < num_bin_) {
    if (bin_upper_bound_[idx] != other.bin_upper_bound_[idx]) {
      return false;
    }
    ++idx;
  }
  return true;
}
/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief True if bin is trivial (contains only one bin) */
......
......@@ -35,12 +35,34 @@ public:
const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) = 0;
/*!
* \brief Merge model from other boosting object
Will insert to the front of current boosting object
* \param other
*/
virtual void MergeFrom(const Boosting* other) = 0;
/*!
* \brief Reset training data for current boosting
* \param config Configs for boosting
* \param train_data Training data
* \param object_function Training objective function
* \param training_metrics Training metric
*/
virtual void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function, const std::vector<const Metric*>& training_metrics) = 0;
/*!
* \brief Reset shrinkage_rate data for current boosting
* \param shrinkage_rate Configs for boosting
*/
virtual void ResetShrinkageRate(double shrinkage_rate) = 0;
/*!
* \brief Add a validation data
* \param valid_data Validation data
* \param valid_metrics Metric for validation data
*/
virtual void AddDataset(const Dataset* valid_data,
virtual void AddValidDataset(const Dataset* valid_data,
const std::vector<const Metric*>& valid_metrics) = 0;
/*!
......@@ -52,6 +74,19 @@ public:
*/
virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) = 0;
/*!
* \brief Rollback one iteration
*/
virtual void RollbackOneIter() = 0;
/*!
* \brief return current iteration
*/
virtual int GetCurrentIteration() const = 0;
/*!
* \brief Eval metrics and check is met early stopping or not
*/
virtual bool EvalAndCheckEarlyStopping() = 0;
/*!
* \brief Get evaluation result at data_idx data
......@@ -73,7 +108,7 @@ public:
* \param result used to store prediction result, should allocate memory before call this function
* \param out_len length of returned score
*/
virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) const = 0;
virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) = 0;
/*!
* \brief Prediction for one record, not sigmoid transform
......@@ -98,12 +133,18 @@ public:
const double* feature_values) const = 0;
/*!
* \brief save model to file
* \param num_used_model number of model that want to save, -1 means save all
* \param is_finish is training finished or not
* \param filename filename that want to save to
* \brief Dump model to json format string
* \return Json format string of model
*/
virtual std::string DumpModel() const = 0;
/*!
* \brief Save model to file
* \param num_used_model Number of model that want to save, -1 means save all
* \param is_finish Is training finished or not
* \param filename Filename that want to save to
*/
virtual void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) = 0;
virtual void SaveModelToFile(int num_iterations, const char* filename) const = 0;
/*!
* \brief Restore from a serialized string
......@@ -127,7 +168,7 @@ public:
* \brief Get number of weak sub-models
* \return Number of weak sub-models
*/
virtual int NumberOfSubModels() const = 0;
virtual int NumberOfTotalModel() const = 0;
/*!
* \brief Get number of classes
......@@ -138,7 +179,7 @@ public:
/*!
* \brief Set number of used model for prediction
*/
virtual void SetNumUsedModel(int num_used_model) = 0;
virtual void SetNumIterationForPred(int num_iteration) = 0;
/*!
* \brief Get Type name of this boosting object
......@@ -151,6 +192,8 @@ public:
/*! \brief Disable copy */
Boosting(const Boosting&) = delete;
static void LoadFileToBoosting(Boosting* boosting, const char* filename);
/*!
* \brief Create boosting object
* \param type Type of boosting
......
......@@ -3,13 +3,15 @@
#include <cstdint>
#include <exception>
#include <stdexcept>
#include <cstring>
#include <string>
/*!
* To avoid type conversion on large data, most of our expose interface support both for float_32 and float_64.
* Except following:
* 1. gradients and hessians.
* 1. gradients and hessians.
* 2. Get current score for training data and validation
* The reason is because they are called frequently, the type-conversion on them maybe time cost.
* The reason is because they are called frequently, the type-conversion on them maybe time cost.
*/
#ifdef __cplusplus
......@@ -38,7 +40,7 @@ typedef void* BoosterHandle;
/*!
* \brief get string message of the last error
* all function in this file will return 0 when success
* all function in this file will return 0 when succeed
* and -1 when an error occurred,
* \return const char* error information
*/
......@@ -53,38 +55,29 @@ DllExport const char* LGBM_GetLastError();
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out a loaded dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromFile(const char* filename,
DllExport int LGBM_DatasetCreateFromFile(const char* filename,
const char* parameters,
const DatesetHandle* reference,
DatesetHandle* out);
/*!
* \brief load data set from binary file like the command_line LightGBM do
* \param filename the name of the file
* \param out a loaded dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
DatesetHandle* out);
/*!
* \brief create a dataset from CSR format
* \param indptr pointer to row headers
* \param indptr_type
* \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
* \param data_type
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param num_col number of columns
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out created dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
DllExport int LGBM_DatasetCreateFromCSR(const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
......@@ -99,19 +92,19 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
/*!
* \brief create a dataset from CSC format
* \param col_ptr pointer to col headers
* \param col_ptr_type
* \param col_ptr_type type of col_ptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
* \param data_type
* \param ncol_ptr number of rows in the matrix + 1
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param ncol_ptr number of cols in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows; when it's set to 0, then guess from data
* \param num_row number of rows
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out created dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
int col_ptr_type,
const int32_t* indices,
const void* data,
......@@ -126,16 +119,16 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
/*!
* \brief create dataset from dense matrix
* \param data pointer to the data space
* \param data_type 0
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow number of rows
* \param ncol number columns
* \param is_row_major 1 for row major, 0 for column major
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out created dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromMat(const void* data,
DllExport int LGBM_DatasetCreateFromMat(const void* data,
int data_type,
int32_t nrow,
int32_t ncol,
......@@ -144,9 +137,25 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
const DatesetHandle* reference,
DatesetHandle* out);
/*!
* \brief Create subset of a data
* \param handle handle of full dataset
* \param used_row_indices Indices used in subset
* \param num_used_row_indices len of used_row_indices
* \param parameters additional parameters
* \param out subset of data
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetSubset(
const DatesetHandle* handle,
const int32_t* used_row_indices,
int32_t num_used_row_indices,
const char* parameters,
DatesetHandle* out);
/*!
* \brief free space for dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetFree(DatesetHandle handle);
......@@ -154,19 +163,21 @@ DllExport int LGBM_DatasetFree(DatesetHandle handle);
* \brief save dateset to binary file
* \param handle a instance of dataset
* \param filename file name
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetSaveBinary(DatesetHandle handle,
const char* filename);
/*!
* \brief set vector to a content in info
* Note: group and group only work for C_API_DTYPE_INT32
* label and weight only work for C_API_DTYPE_FLOAT32
* \param handle a instance of dataset
* \param field_name field name, can be label, weight, group
* \param field_name field name, can be label, weight, group, group_id
* \param field_data pointer to vector
* \param num_element number of element in field_data
* \param type float_32:0, int32_t:1
* \return 0 when success, -1 when failure happens
* \param type C_API_DTYPE_FLOAT32 or C_API_DTYPE_INT32
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetSetField(DatesetHandle handle,
const char* field_name,
......@@ -180,8 +191,8 @@ DllExport int LGBM_DatasetSetField(DatesetHandle handle,
* \param field_name field name
* \param out_len used to set result length
* \param out_ptr pointer to the result
* \param out_type float_32:0, int32_t:1
* \return 0 when success, -1 when failure happens
* \param out_type C_API_DTYPE_FLOAT32 or C_API_DTYPE_INT32
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetField(DatesetHandle handle,
const char* field_name,
......@@ -193,7 +204,7 @@ DllExport int LGBM_DatasetGetField(DatesetHandle handle,
* \brief get number of data.
* \param handle the handle to the dataset
* \param out The address to hold number of data
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
int64_t* out);
......@@ -202,7 +213,7 @@ DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
* \brief get number of features
* \param handle the handle to the dataset
* \param out The output of number of features
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
int64_t* out);
......@@ -212,42 +223,82 @@ DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
/*!
* \brief create an new boosting learner
* \param train_data training data set
* \param valid_datas validation data sets
* \param valid_names names of validation data sets
* \param n_valid_datas number of validation set
* \param parameters format: 'key1=value1 key2=value2'
* \param out handle of created Booster
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
const DatesetHandle valid_datas[],
const char* valid_names[],
int n_valid_datas,
const char* parameters,
BoosterHandle* out);
/*!
* \brief load an existing boosting from model file
* \param filename filename of model
* \param out_num_iterations number of iterations of this booster
* \param out handle of created Booster
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterLoadFromModelfile(
DllExport int LGBM_BoosterCreateFromModelfile(
const char* filename,
int64_t* out_num_iterations,
BoosterHandle* out);
/*!
* \brief free obj in handle
* \param handle handle to be freed
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterFree(BoosterHandle handle);
/*!
* \brief Merge model in two booster to first handle
* \param handle handle, will merge other handle to this
* \param other_handle
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterMerge(BoosterHandle handle,
BoosterHandle other_handle);
/*!
* \brief Add new validation to booster
* \param handle handle
* \param valid_data validation data set
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterAddValidData(BoosterHandle handle,
const DatesetHandle valid_data);
/*!
* \brief Reset training data for booster
* \param handle handle
* \param train_data training data set
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterResetTrainingData(BoosterHandle handle,
const DatesetHandle train_data);
/*!
* \brief Reset config for current booster
* \param handle handle
* \param parameters format: 'key1=value1 key2=value2'
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterResetParameter(BoosterHandle handle, const char* parameters);
/*!
* \brief Get number of class
* \param handle handle
* \param out_len number of class
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetNumClasses(BoosterHandle handle, int64_t* out_len);
/*!
* \brief update the model in one round
* \param handle handle
* \param is_finished 1 means finished (cannot split any more)
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished);
......@@ -258,7 +309,7 @@ DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished);
* \param grad gradient statistics
* \param hess second order gradient statistics
* \param is_finished 1 means finished (cannot split any more)
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
const float* grad,
......@@ -266,81 +317,106 @@ DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
int* is_finished);
/*!
* \brief get evaluation for training data and validation data
* \brief Rollback one iteration
* \param handle handle
* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \param out_result the string containing evaluation statistics, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterEval(BoosterHandle handle,
int data,
int64_t* out_len,
float* out_results);
DllExport int LGBM_BoosterRollbackOneIter(BoosterHandle handle);
/*!
* \brief Get iteration of current boosting rounds
* \param out_iteration iteration of boosting rounds
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetCurrentIteration(BoosterHandle handle, int64_t* out_iteration);
/*!
* \brief get raw score for training data, used to calculate gradients outside
* \brief Get number of eval
* \param out_len total number of eval results
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len);
/*!
* \brief Get Name of eval
* \param out_len total number of eval results
* \param out_strs names of eval result
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, char** out_strs);
/*!
* \brief get evaluation for training data and validation data
Note: 1. you should call LGBM_BoosterGetEvalNames first to get the name of evaluation results
2. should pre-allocate memory for out_results, you can get its length by LGBM_BoosterGetEvalCounts
* \param handle handle
* \param data_idx 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \param out_result used to set a pointer to array
* \return 0 when success, -1 when failure happens
* \param out_result float array containing the results
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetScore(BoosterHandle handle,
DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
int data_idx,
int64_t* out_len,
const float** out_result);
float* out_results);
/*!
* \brief Get prediction for training data and validation data
this can be used to support customized eval function
this can be used to support customized eval function
Note: should pre-allocate memory for out_result, its length is equal to num_class * num_data
* \param handle handle
* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param data_idx 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
int data,
int data_idx,
int64_t* out_len,
float* out_result);
/*!
* \brief make prediction for file
* \param handle handle
* \param predict_type
* 0:raw score
* 1:with transform(if needed)
* 2:leaf index
* \param n_used_trees number of used tree
* \param data_has_header data file has header or not
* \param data_filename filename of data file
* \param data_has_header data file has header or not
* \param predict_type
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param result_filename filename of result file
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
int predict_type,
int64_t n_used_trees,
int data_has_header,
const char* data_filename,
int data_has_header,
int predict_type,
int64_t num_iteration,
const char* result_filename);
/*!
* \brief make prediction for an new data set
* Note: should pre-allocate memory for out_result,
* for normal and raw score: its length is equal to num_class * num_data
* for leaf index, its length is equal to num_class * num_data * num_iteration
* \param handle handle
* \param indptr pointer to row headers
* \param indptr_type
* \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
* \param data_type
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param predict_type
* 0:raw score
* 1:with transform(if needed)
* 2:leaf index
* \param n_used_trees number of used tree
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
const void* indptr,
......@@ -352,24 +428,29 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int64_t nelem,
int64_t num_col,
int predict_type,
int64_t n_used_trees,
double* out_result);
int64_t num_iteration,
int64_t* out_len,
float* out_result);
/*!
* \brief make prediction for an new data set
* Note: should pre-allocate memory for out_result,
* for normal and raw score: its length is equal to num_class * num_data
* for leaf index, its length is equal to num_class * num_data * num_iteration
* \param handle handle
* \param data pointer to the data space
* \param data_type
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow number of rows
* \param ncol number columns
* \param is_row_major 1 for row major, 0 for column major
* \param predict_type
* 0:raw score
* 1:with transform(if needed)
* 2:leaf index
* \param n_used_trees number of used tree
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
const void* data,
......@@ -378,21 +459,33 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int32_t ncol,
int is_row_major,
int predict_type,
int64_t n_used_trees,
double* out_result);
int64_t num_iteration,
int64_t* out_len,
float* out_result);
/*!
* \brief save model into file
* \param handle handle
* \param num_used_model
* \param num_iteration, <= 0 means save all
* \param filename file name
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
int num_used_model,
int num_iteration,
const char* filename);
/*!
* \brief dump model to json
* \param handle handle
* \param buffer_len string buffer length, if buffer_len < out_len, re-allocate buffer
* \param out_len actual output length
* \param out_str json format string of model
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len,
int64_t* out_len,
char** out_str);
// some help functions used to convert data
......@@ -403,23 +496,25 @@ std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major);
std::function<std::vector<std::pair<int, double>>(int idx)>
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
const void* data, int data_type, int64_t nindptr, int64_t nelem);
std::function<std::vector<std::pair<int, double>>(int idx)>
ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices,
ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices,
const void* data, int data_type, int64_t ncol_ptr, int64_t nelem);
std::vector<double>
std::vector<double>
SampleFromOneColumn(const std::vector<std::pair<int, double>>& data, const std::vector<int>& indices);
#if defined(_MSC_VER)
// exception handle and error msg
static std::string& LastErrorMsg() { static std::string err_msg("Everything is fine"); return err_msg; }
static char* LastErrorMsg() { static __declspec(thread) char err_msg[512] = "Everything is fine"; return err_msg; }
#else
static char* LastErrorMsg() { static thread_local char err_msg[512] = "Everything is fine"; return err_msg; }
#endif
inline void LGBM_SetLastError(const char* msg) {
LastErrorMsg() = msg;
std::strcpy(LastErrorMsg(), msg);
}
inline int LGBM_APIHandleException(const std::exception& ex) {
......@@ -437,6 +532,6 @@ inline int LGBM_APIHandleException(const std::string& ex) {
catch(std::exception& ex) { return LGBM_APIHandleException(ex); } \
catch(std::string& ex) { return LGBM_APIHandleException(ex); } \
catch(...) { return LGBM_APIHandleException("unknown exception"); } \
return 0;
return 0;
#endif // LIGHTGBM_C_API_H_
......@@ -72,6 +72,8 @@ public:
inline bool GetBool(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, bool* out);
static std::unordered_map<std::string, std::string> Str2Map(const char* parameters);
};
/*! \brief Types of boosting */
......@@ -97,7 +99,7 @@ public:
std::string output_result = "LightGBM_predict_result.txt";
std::string input_model = "";
int verbosity = 1;
int num_model_predict = NO_LIMIT;
int num_iteration_predict = -1;
bool is_pre_partition = false;
bool is_enable_sparse = true;
bool use_two_round_loading = false;
......@@ -136,6 +138,8 @@ public:
bool is_unbalance = false;
// for multiclass
int num_class = 1;
// Balancing of positive and negative weights
double scale_pos_weight = 1.0f;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -164,12 +168,12 @@ public:
int feature_fraction_seed = 2;
double feature_fraction = 1.0f;
// max cache size(unit:MB) for historical histogram. < 0 means not limit
double histogram_pool_size = NO_LIMIT;
double histogram_pool_size = -1.0f;
// max depth of tree model.
// Still grow tree by leaf-wise, but limit the max depth to avoid over-fitting
// And the max leaves will be min(num_leaves, pow(2, max_depth - 1))
// max_depth < 0 means not limit
int max_depth = NO_LIMIT;
int max_depth = -1;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -231,7 +235,7 @@ public:
MetricConfig metric_config;
void Set(const std::unordered_map<std::string, std::string>& params) override;
void LoadFromString(const char* str);
private:
void GetBoostingType(const std::unordered_map<std::string, std::string>& params);
......@@ -328,17 +332,22 @@ struct ParameterAlias {
{ "ndcg_at", "ndcg_eval_at" },
{ "min_data_per_leaf", "min_data_in_leaf" },
{ "min_data", "min_data_in_leaf" },
{ "min_child_samples", "min_data_in_leaf" },
{ "min_sum_hessian_per_leaf", "min_sum_hessian_in_leaf" },
{ "min_sum_hessian", "min_sum_hessian_in_leaf" },
{ "min_hessian", "min_sum_hessian_in_leaf" },
{ "min_child_weight", "min_sum_hessian_in_leaf" },
{ "num_leaf", "num_leaves" },
{ "sub_feature", "feature_fraction" },
{ "colsample_bytree", "feature_fraction" },
{ "num_iteration", "num_iterations" },
{ "num_tree", "num_iterations" },
{ "num_round", "num_iterations" },
{ "num_trees", "num_iterations" },
{ "num_rounds", "num_iterations" },
{ "sub_row", "bagging_fraction" },
{ "subsample", "bagging_fraction" },
{ "subsample_freq", "bagging_freq" },
{ "shrinkage_rate", "learning_rate" },
{ "tree", "tree_learner" },
{ "num_machine", "num_machines" },
......@@ -361,6 +370,9 @@ struct ParameterAlias {
{ "blacklist", "ignore_column" },
{ "predict_raw_score", "is_predict_raw_score" },
{ "predict_leaf_index", "is_predict_leaf_index" },
{ "min_split_gain", "min_gain_to_split" },
{ "reg_alpha", "lambda_l1" },
{ "reg_lambda", "lambda_l2" },
{ "num_classes", "num_class" }
});
std::unordered_map<std::string, std::string> tmp_map;
......
......@@ -13,6 +13,7 @@
#include <functional>
#include <string>
#include <unordered_set>
#include <mutex>
namespace LightGBM {
......@@ -46,6 +47,13 @@ public:
*/
void Init(const char* data_filename, const int num_class);
/*!
* \brief init as subset
* \param metadata Filename of data
* \param used_indices
* \param num_used_indices
*/
void Init(const Metadata& metadata, const data_size_t* used_indices, data_size_t num_used_indices);
/*!
* \brief Initial with binary memory
* \param memory Pointer to memory
*/
......@@ -76,13 +84,14 @@ public:
void CheckOrPartition(data_size_t num_all_data,
const std::vector<data_size_t>& used_data_indices);
void SetLabel(const float* label, data_size_t len);
void SetWeights(const float* weights, data_size_t len);
void SetQueryBoundaries(const data_size_t* query_boundaries, data_size_t len);
void SetQueryId(const data_size_t* query_id, data_size_t len);
/*!
* \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score.
......@@ -141,8 +150,13 @@ public:
* \brief Get weights, if not exists, will return nullptr
* \return Pointer of weights
*/
inline const float* weights()
const { return weights_.data(); }
/*! \brief Get per-record weights; returns nullptr when no weights were loaded */
inline const float* weights() const {
  // an empty vector means the dataset is unweighted
  if (weights_.size() > 0) {
    return weights_.data();
  } else {
    return nullptr;
  }
}
/*!
* \brief Get data boundaries on queries, if not exists, will return nullptr
......@@ -151,8 +165,13 @@ public:
* is the data indices for query i.
* \return Pointer of data boundaries on queries
*/
inline const data_size_t* query_boundaries()
const { return query_boundaries_.data(); }
/*! \brief Get query boundary indices; returns nullptr when no query data was loaded */
inline const data_size_t* query_boundaries() const {
  // an empty vector means no query/group information exists
  if (query_boundaries_.size() > 0) {
    return query_boundaries_.data();
  } else {
    return nullptr;
  }
}
/*!
* \brief Get Number of queries
......@@ -164,13 +183,25 @@ public:
* \brief Get weights for queries, if not exists, will return nullptr
* \return Pointer of weights for queries
*/
inline const float* query_weights() const { return query_weights_.data(); }
/*! \brief Get per-query weights; returns nullptr when none were loaded */
inline const float* query_weights() const {
  // an empty vector means queries are unweighted
  if (query_weights_.size() > 0) {
    return query_weights_.data();
  } else {
    return nullptr;
  }
}
/*!
* \brief Get initial scores, if not exists, will return nullptr
* \return Pointer of initial scores
*/
inline const float* init_score() const { return init_score_.data(); }
/*! \brief Get initial scores; returns nullptr when no initial scores were set */
inline const float* init_score() const {
  // an empty vector means no initial scores exist
  if (init_score_.size() > 0) {
    return init_score_.data();
  } else {
    return nullptr;
  }
}
/*! \brief Disable copy */
Metadata& operator=(const Metadata&) = delete;
......@@ -210,6 +241,8 @@ private:
std::vector<float> init_score_;
/*! \brief Queries data */
std::vector<data_size_t> queries_;
/*! \brief mutex for threading safe call */
std::mutex mutex_;
};
......@@ -253,6 +286,27 @@ public:
/*! \brief Destructor */
~Dataset();
/*! \brief Check whether another Dataset has an identical schema:
 *  same feature counts, class count, label column index, and
 *  pairwise-aligned features. Returns false on the first mismatch. */
bool CheckAlign(const Dataset& other) const {
  if (num_features_ != other.num_features_) {
    return false;
  }
  if (num_total_features_ != other.num_total_features_) {
    return false;
  }
  if (num_class_ != other.num_class_) {
    return false;
  }
  if (label_idx_ != other.label_idx_) {
    return false;
  }
  // every used feature must align with its counterpart as well
  for (int i = 0; i < num_features_; ++i) {
    if (!features_[i]->CheckAlign(*(other.features_[i].get()))) {
      return false;
    }
  }
  return true;
}
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
int feature_idx = used_feature_map_[i];
......@@ -282,6 +336,8 @@ public:
}
}
Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const;
void FinishLoad();
bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
......@@ -348,12 +404,12 @@ private:
int num_class_;
/*! \brief Store some label level data*/
Metadata metadata_;
/*! \brief True if dataset is loaded from binary file */
bool is_loading_from_binfile_;
/*! \brief index of label column */
int label_idx_ = 0;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
/*! \brief store feature names */
static const char* binary_file_token;
};
} // namespace LightGBM
......
......@@ -49,7 +49,7 @@ private:
void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);
/*! \brief Check can load from binary file */
bool CheckCanLoadFromBin(const char* filename);
std::string CheckCanLoadFromBin(const char* filename);
const IOConfig& io_config_;
/*! \brief Random generator*/
......
......@@ -63,6 +63,13 @@ public:
~Feature() {
}
/*! \brief Check whether another Feature has the same index and an
 *  aligned bin mapper (delegates to BinMapper::CheckAlign) */
bool CheckAlign(const Feature& other) const {
  if (feature_index_ != other.feature_index_) {
    return false;
  }
  return bin_mapper_->CheckAlign(*(other.bin_mapper_.get()));
}
/*!
* \brief Push one record, will auto convert to bin and push to bin data
* \param tid Thread id
......@@ -73,6 +80,9 @@ public:
unsigned int bin = bin_mapper_->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin);
}
/*! \brief Push an already-binned value directly into the bin data
 *  (skips the value-to-bin conversion done by the raw-value push) */
inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
  bin_data_->Push(tid, line_idx, bin);
}
inline void FinishLoad() { bin_data_->FinishLoad(); }
/*! \brief Index of this feature */
inline int feature_index() const { return feature_index_; }
......
......@@ -24,7 +24,6 @@ using ReduceFunction = std::function<void(const char*, char*, int)>;
using PredictFunction =
std::function<std::vector<double>(const std::vector<std::pair<int, double>>&)>;
#define NO_LIMIT (-1)
#define NO_SPECIFIC (-1)
} // namespace LightGBM
......
......@@ -24,8 +24,7 @@ public:
* \param metadata Label data
* \param num_data Number of data
*/
virtual void Init(const char* test_name,
const Metadata& metadata, data_size_t num_data) = 0;
virtual void Init(const Metadata& metadata, data_size_t num_data) = 0;
virtual const std::vector<std::string>& GetName() const = 0;
......
......@@ -98,13 +98,12 @@ public:
}
}
/*! \brief Serialize this object by string*/
/*! \brief Serialize this object to string*/
std::string ToString();
/*! \brief Disable copy */
Tree& operator=(const Tree&) = delete;
/*! \brief Disable copy */
Tree(const Tree&) = delete;
/*! \brief Serialize this object to json*/
std::string ToJSON();
private:
/*!
* \brief Find leaf index of which record belongs by data
......@@ -122,6 +121,9 @@ private:
*/
inline int GetLeaf(const double* feature_values) const;
/*! \brief Serialize one node to json*/
inline std::string NodeToJSON(int index);
/*! \brief Number of max leaves*/
int max_leaves_;
/*! \brief Number of current levas*/
......@@ -141,13 +143,13 @@ private:
std::vector<double> threshold_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
// used for leaf node
/*! \brief The parent of leaf */
std::vector<int> leaf_parent_;
/*! \brief Output of leaves */
std::vector<double> leaf_value_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
/*! \brief Depth for leaves */
std::vector<int> leaf_depth_;
};
......
......@@ -89,7 +89,11 @@ private:
// a trick to use static variable in header file.
// May be not good, but avoid to use an additional cpp file
static LogLevel& GetLevel() { static LogLevel level; return level; }
#if defined(_MSC_VER)
static LogLevel& GetLevel() { static __declspec(thread) LogLevel level = LogLevel::Info; return level; }
#else
static LogLevel& GetLevel() { static thread_local LogLevel level = LogLevel::Info; return level; }
#endif
};
......
LightGBM Python Package
=======================
Installation
------------
1. Follow the `Installation Guide <https://github.com/Microsoft/LightGBM/wiki/Installation-Guide>`__ to build LightGBM first.
   For Windows users, please change the build config to ``DLL``.
2. Install with ``cd python-package; python setup.py install``
Note: make sure you have `setuptools <https://pypi.python.org/pypi/setuptools>`__ installed.
Examples
--------
- Refer also to the walk-through examples in the `python-guide
  folder <https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide>`__.
# coding: utf-8
"""LightGBM, Light Gradient Boosting Machine.

Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
"""
from __future__ import absolute_import

import os

from .basic import Predictor, Dataset, Booster
from .engine import train, cv
try:
    # the scikit-learn wrappers are optional: skip them when sklearn is absent
    from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
except ImportError:
    pass

# version is a string, per packaging convention (was the float 0.1)
__version__ = '0.1'

__all__ = ['Dataset', 'Booster',
           'train', 'cv',
           'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
# coding: utf-8
# pylint: disable = invalid-name, C0111, R0912, R0913, R0914, W0105
"""Wrapper c_api of LightGBM"""
from __future__ import absolute_import
import sys
import os
import ctypes
import tempfile
import json
import numpy as np
import scipy.sparse
from .libpath import find_lib_path
# pandas
# Optional dependency: when pandas is missing, define empty stand-in classes
# so that isinstance(..., Series/DataFrame) checks elsewhere always work.
try:
    from pandas import Series, DataFrame
    IS_PANDAS_INSTALLED = True
except ImportError:
    IS_PANDAS_INSTALLED = False

    class Series(object):
        """Dummy placeholder used when pandas is not installed."""
        pass

    class DataFrame(object):
        """Dummy placeholder used when pandas is not installed."""
        pass

# True when running under Python 3 (drives str vs. basestring checks)
IS_PY3 = (sys.version_info[0] == 3)
def _load_lib():
    """Locate and load the LightGBM shared library.

    Returns
    -------
    ctypes.CDLL
        Loaded library, with ``LGBM_GetLastError`` configured to return
        a C string.

    Raises
    ------
    Exception
        If no library file can be found.
    """
    candidates = find_lib_path()
    if not candidates:
        raise Exception("cannot find LightGBM library")
    # the first discovered path wins
    lib = ctypes.cdll.LoadLibrary(candidates[0])
    # error messages come back as C strings, not the default int
    lib.LGBM_GetLastError.restype = ctypes.c_char_p
    return lib
_LIB = _load_lib()
class LightGBMError(Exception):
    """Error raised from the LightGBM C API."""
    pass
def _safe_call(ret):
"""Check the return value of C API call
Parameters
----------
ret : int
return value from API calls
"""
if ret != 0:
raise LightGBMError(_LIB.LGBM_GetLastError())
def is_str(s):
    """Check whether *s* is a string (``str``, or ``basestring`` under Python 2)."""
    string_type = str if IS_PY3 else basestring
    return isinstance(s, string_type)
def is_numpy_object(data):
    """Check whether *data*'s type is defined by the numpy module."""
    module_of_type = type(data).__module__
    return module_of_type == np.__name__
def is_numpy_1d_array(data):
    """Check whether *data* is a one-dimensional numpy array."""
    return isinstance(data, np.ndarray) and len(data.shape) == 1
def is_1d_list(data):
    """Check whether *data* is a flat list of scalars.

    Only the first element is inspected (cheap heuristic, as in the
    rest of this module).
    """
    if not isinstance(data, list):
        return False
    if data and not isinstance(data[0], (int, float, bool)):
        return False
    return True
def list_to_1d_numpy(data, dtype):
    """Convert a 1-D list / numpy array / pandas Series to a numpy array of *dtype*."""
    if is_numpy_1d_array(data):
        # already an ndarray: avoid a copy when the dtype already matches
        if data.dtype == dtype:
            return data
        return data.astype(dtype=dtype, copy=False)
    if is_1d_list(data):
        return np.array(data, dtype=dtype, copy=False)
    if IS_PANDAS_INSTALLED and isinstance(data, Series):
        return data.astype(dtype).values
    raise TypeError("Unknow type({})".format(type(data).__name__))
def cfloat32_array_to_numpy(cptr, length):
    """Copy *length* values from a ctypes float pointer into a float32 numpy array."""
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
        raise RuntimeError('expected float pointer')
    return np.fromiter(cptr, dtype=np.float32, count=length)
def cint32_array_to_numpy(cptr, length):
    """Copy *length* values from a ctypes int32 pointer into an int32 numpy array."""
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
        raise RuntimeError('expected int pointer')
    return np.fromiter(cptr, dtype=np.int32, count=length)
def c_str(string):
    """Encode a python string as a UTF-8 C char pointer."""
    encoded = string.encode('utf-8')
    return ctypes.c_char_p(encoded)
def c_array(ctype, values):
    """Build a fixed-length ctypes array of *ctype* holding *values*."""
    array_type = ctype * len(values)
    return array_type(*values)
def param_dict_to_str(data):
    """Serialize a parameter dict into the space-separated 'key=value' string
    format the LightGBM C API expects.  List/tuple values become
    comma-separated; None or an empty dict yields the empty string.
    """
    if data is None or len(data) == 0:
        return ""
    pairs = []
    for key, val in data.items():
        if is_str(val):
            pairs.append('{0}={1}'.format(key, val))
        elif isinstance(val, (list, tuple)):
            joined = ','.join(map(str, val))
            pairs.append('{0}={1}'.format(key, joined))
        elif isinstance(val, (int, float, bool)):
            pairs.append('{0}={1}'.format(key, val))
        else:
            raise TypeError('unknow type of parameter:%s , got:%s'
                            % (key, type(val).__name__))
    return ' '.join(pairs)
"""marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32 = 0  # 32-bit float
C_API_DTYPE_FLOAT64 = 1  # 64-bit float
C_API_DTYPE_INT32 = 2    # 32-bit int
C_API_DTYPE_INT64 = 3    # 64-bit int

"""Matric is row major in python"""
C_API_IS_ROW_MAJOR = 1

# prediction task types accepted by the C API
C_API_PREDICT_NORMAL = 0      # ordinary (transformed) prediction
C_API_PREDICT_RAW_SCORE = 1   # raw, untransformed scores
C_API_PREDICT_LEAF_INDEX = 2  # predicted leaf index per tree

# expected C data type for each settable/gettable Dataset field
FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
                     "weight": C_API_DTYPE_FLOAT32,
                     "init_score": C_API_DTYPE_FLOAT32,
                     "group": C_API_DTYPE_INT32}
def c_float_array(data):
    """Convert a 1-D list or float numpy array to a C float pointer.

    Returns a ``(pointer, type_code)`` pair where *type_code* is the
    matching ``C_API_DTYPE_*`` constant.
    """
    if is_1d_list(data):
        data = np.array(data, copy=False)
    if not is_numpy_1d_array(data):
        raise TypeError("Unknow type({})".format(type(data).__name__))
    if data.dtype == np.float32:
        pointer = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        return (pointer, C_API_DTYPE_FLOAT32)
    if data.dtype == np.float64:
        pointer = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        return (pointer, C_API_DTYPE_FLOAT64)
    raise TypeError("expected np.float32 or np.float64, met type({})"
                    .format(data.dtype))
def c_int_array(data):
    """Convert a 1-D list or integer numpy array to a C int pointer.

    Returns a ``(pointer, type_code)`` pair where *type_code* is the
    matching ``C_API_DTYPE_*`` constant.
    """
    if is_1d_list(data):
        data = np.array(data, copy=False)
    if not is_numpy_1d_array(data):
        raise TypeError("Unknow type({})".format(type(data).__name__))
    if data.dtype == np.int32:
        pointer = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
        return (pointer, C_API_DTYPE_INT32)
    if data.dtype == np.int64:
        pointer = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
        return (pointer, C_API_DTYPE_INT64)
    raise TypeError("expected np.int32 or np.int64, met type({})"
                    .format(data.dtype))
class Predictor(object):
    """A Predictor of LightGBM.

    Thin prediction-only wrapper around a booster handle from the C API.
    It can be constructed from a model file or wrap an existing handle,
    and predicts from a text file path, a 2-D numpy array, or a
    scipy CSR matrix.
    """
    def __init__(self, model_file=None, booster_handle=None, is_manage_handle=True):
        """Initialize the Predictor.

        Parameters
        ----------
        model_file : string, optional
            Path to the model file; when given, a booster is loaded from it.
        booster_handle : ctypes.c_void_p, optional
            Handle of an existing booster (used when model_file is None).
        is_manage_handle : bool, optional
            Whether this object owns booster_handle and must free it.
        """
        self.handle = ctypes.c_void_p()
        # default to owning the handle; overridden below when wrapping an
        # externally managed one
        self.__is_manage_handle = True
        if model_file is not None:
            """Prediction task"""
            out_num_iterations = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(out_num_iterations),
                ctypes.byref(self.handle)))
            out_num_class = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
            self.__num_total_iteration = out_num_iterations.value
        elif booster_handle is not None:
            self.__is_manage_handle = is_manage_handle
            self.handle = booster_handle
            out_num_class = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
            out_num_iterations = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
                self.handle,
                ctypes.byref(out_num_iterations)))
            self.__num_total_iteration = out_num_iterations.value
        else:
            raise TypeError('Need Model file to create a booster')

    def __del__(self):
        # free the underlying C booster only when this object owns it
        if self.__is_manage_handle:
            _safe_call(_LIB.LGBM_BoosterFree(self.handle))

    def predict(self, data, num_iteration=-1,
                raw_score=False, pred_leaf=False, data_has_header=False,
                is_reshape=True):
        """
        Predict logic

        Parameters
        ----------
        data : string/numpy array/scipy.sparse
            Data source for prediction
            When data is string type, it represents the path of txt file,
        num_iteration : int
            used iteration for prediction; values <= 0 or larger than the
            model use all iterations
        raw_score : bool
            True for predict raw score
        pred_leaf : bool
            True for predict leaf index
        data_has_header : bool
            Used for txt data
        is_reshape : bool
            True for reshape to [nrow, ...]

        Returns
        -------
        Prediction result
        """
        if isinstance(data, Dataset):
            raise TypeError("cannot use Dataset instance for prediction, \
please use raw data instead")
        # pred_leaf wins over raw_score when both are set
        predict_type = C_API_PREDICT_NORMAL
        if raw_score:
            predict_type = C_API_PREDICT_RAW_SCORE
        if pred_leaf:
            predict_type = C_API_PREDICT_LEAF_INDEX
        int_data_has_header = 1 if data_has_header else 0
        # clamp to the number of iterations actually in the model
        if num_iteration > self.__num_total_iteration:
            num_iteration = self.__num_total_iteration
        if is_str(data):
            # file input: the C API writes predictions to a temp file,
            # which is parsed back and removed
            # NOTE(review): only the generated .name is used, so the file is
            # recreated by the C side — confirm this is safe on all platforms
            tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
            _safe_call(_LIB.LGBM_BoosterPredictForFile(
                self.handle,
                c_str(data),
                int_data_has_header,
                predict_type,
                num_iteration,
                c_str(tmp_pred_fname)))
            tmp_file = open(tmp_pred_fname, "r")
            lines = tmp_file.readlines()
            tmp_file.close()
            nrow = len(lines)
            preds = []
            for line in lines:
                for token in line.split('\t'):
                    preds.append(float(token))
            preds = np.array(preds, copy=False)
            os.remove(tmp_pred_fname)
        elif isinstance(data, scipy.sparse.csr_matrix):
            preds, nrow = self.__pred_for_csr(data, num_iteration,
                                              predict_type)
        elif isinstance(data, np.ndarray):
            preds, nrow = self.__pred_for_np2d(data, num_iteration,
                                               predict_type)
        else:
            # last resort: try to coerce unknown input into a CSR matrix
            try:
                csr = scipy.sparse.csr_matrix(data)
                preds, nrow = self.__pred_for_csr(csr, num_iteration,
                                                  predict_type)
            except:
                raise TypeError('can not predict data for type {}'.
                                format(type(data).__name__))
        if pred_leaf:
            # leaf indices are integers
            preds = preds.astype(np.int32)
        # reshape flat output to (nrow, ncol) when multiple values per row
        if preds.size != nrow and is_reshape:
            if preds.size % nrow == 0:
                ncol = int(preds.size / nrow)
                preds = preds.reshape(nrow, ncol)
            else:
                raise ValueError('len of predict result(%d) cannot be divide nrow (%d)'
                                 % (preds.size, nrow))
        return preds

    def __get_num_preds(self, num_iteration, nrow, predict_type):
        # base size: one score per class per row
        n_preds = self.num_class * nrow
        if predict_type == C_API_PREDICT_LEAF_INDEX:
            # leaf prediction additionally emits one index per used iteration
            if num_iteration > 0:
                n_preds *= min(num_iteration, self.__num_total_iteration)
            else:
                n_preds *= self.__num_total_iteration
        return n_preds

    def __pred_for_np2d(self, mat, num_iteration, predict_type):
        """
        Predict for a 2-D numpy matrix.
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
        else:
            """change non-float data to float data, need to copy"""
            data = np.array(mat.reshape(mat.size), dtype=np.float32)
        ptr_data, type_ptr_data = c_float_array(data)
        # preallocate the result buffer for the C API to fill
        n_preds = self.__get_num_preds(num_iteration, mat.shape[0],
                                       predict_type)
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)
        _safe_call(_LIB.LGBM_BoosterPredictForMat(
            self.handle,
            ptr_data,
            type_ptr_data,
            mat.shape[0],
            mat.shape[1],
            C_API_IS_ROW_MAJOR,
            predict_type,
            num_iteration,
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        ))
        # sanity check: C side must have written exactly what we allocated
        if n_preds != out_num_preds.value:
            raise ValueError("incorrect number for predict result")
        return preds, mat.shape[0]

    def __pred_for_csr(self, csr, num_iteration, predict_type):
        """
        Predict for a csr data
        """
        # number of rows in a CSR matrix is len(indptr) - 1
        nrow = len(csr.indptr) - 1
        n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)
        ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
        ptr_data, type_ptr_data = c_float_array(csr.data)
        _safe_call(_LIB.LGBM_BoosterPredictForCSR(
            self.handle,
            ptr_indptr,
            type_ptr_indptr,
            csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            type_ptr_data,
            len(csr.indptr),
            len(csr.data),
            csr.shape[1],
            predict_type,
            num_iteration,
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        ))
        # sanity check: C side must have written exactly what we allocated
        if n_preds != out_num_preds.value:
            raise ValueError("incorrect number for predict result")
        return preds, nrow
# pandas dtype name -> coarse category; only the KEYS are consulted in the
# visible code (membership tests in _data_from_pandas / _label_from_pandas).
# NOTE(review): 'bool' maps to 'i' rather than 'int' — looks like a typo,
# but the values appear unused here, so it is harmless; confirm before relying
# on the values.
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
                       'int64': 'int', 'uint8': 'int', 'uint16': 'int',
                       'uint32': 'int', 'uint64': 'int', 'float16': 'float',
                       'float32': 'float', 'float64': 'float', 'bool': 'i'}
def _data_from_pandas(data):
if isinstance(data, DataFrame):
data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise ValueError(msg + ', '.join(bad_fields))
data = data.values.astype('float')
return data
def _label_from_pandas(label):
if isinstance(label, DataFrame):
if len(label.columns) > 1:
raise ValueError('DataFrame for label cannot have multiple columns')
label_dtypes = label.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float')
return label
class Dataset(object):
    """Dataset used in LightGBM.

    Dataset is an internal data structure used by LightGBM, wrapping a
    handle from the C API.
    """
    def __init__(self, data, label=None, max_bin=255, reference=None,
                 weight=None, group=None, predictor=None,
                 silent=False, params=None):
        """
        Dataset used in LightGBM.

        Parameters
        ----------
        data : string/numpy array/scipy.sparse
            Data source of Dataset.
            When data is string type, it represents the path of txt file,
        label : list or numpy 1-D array, optional
            Label of the data
        max_bin : int, required
            max number of discrete bin for features
        reference : Other Dataset, optional
            If this dataset validation, need to use training data as reference
        weight : list or numpy 1-D array , optional
            Weight for each instance.
        group : list or numpy 1-D array , optional
            group/query size for dataset
        predictor : Predictor, optional
            Used to compute the initial score (for continued training)
        silent : boolean, optional
            Whether print messages during construction
        params: dict, optional
            other parameters
        """
        self.__label = None
        self.__weight = None
        self.__init_score = None
        self.__group = None
        if data is None:
            # empty shell; callers such as subset() fill in the handle later
            self.handle = None
            return
        data = _data_from_pandas(data)
        label = _label_from_pandas(label)
        self.data_has_header = False
        # process for args
        params = {} if params is None else params
        self.max_bin = max_bin
        self.predictor = predictor
        params["max_bin"] = max_bin
        if silent:
            params["verbose"] = 0
        elif "verbose" not in params:
            params["verbose"] = 1
        params_str = param_dict_to_str(params)
        # process for reference dataset
        ref_dataset = None
        if isinstance(reference, Dataset):
            ref_dataset = ctypes.byref(reference.handle)
        elif reference is not None:
            raise TypeError('Reference dataset should be None or dataset instance')
        # start construct data
        if is_str(data):
            # check data has header or not
            # fix: use .get() so that supplying only one of the two alias
            # keys no longer raises KeyError (the original indexed both)
            if "has_header" in params or "header" in params:
                if str(params.get("has_header", "")).lower() == "true" \
                        or str(params.get("header", "")).lower() == "true":
                    self.data_has_header = True
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_DatasetCreateFromFile(
                c_str(data),
                c_str(params_str),
                ref_dataset,
                ctypes.byref(self.handle)))
        elif isinstance(data, scipy.sparse.csr_matrix):
            self.__init_from_csr(data, params_str, ref_dataset)
        elif isinstance(data, np.ndarray):
            self.__init_from_np2d(data, params_str, ref_dataset)
        else:
            # last resort: try to coerce unknown input into a CSR matrix
            try:
                csr = scipy.sparse.csr_matrix(data)
                self.__init_from_csr(csr, params_str, ref_dataset)
            except:
                raise TypeError('can not initialize Dataset from {}'.format(type(data).__name__))
        if label is not None:
            self.set_label(label)
        if self.get_label() is None:
            raise ValueError("label should not be None")
        if weight is not None:
            self.set_weight(weight)
        if group is not None:
            self.set_group(group)
        # load init score: continued training starts from the predictor's
        # raw scores on this data
        if self.predictor is not None and isinstance(self.predictor, Predictor):
            init_score = self.predictor.predict(data,
                                                raw_score=True,
                                                data_has_header=self.data_has_header,
                                                is_reshape=False)
            if self.predictor.num_class > 1:
                # need re group init score: from row-major (per-record classes)
                # to class-major layout expected by the C API
                # fix: ndarray.size is an attribute, not a method
                # (was init_score.size(), which raised TypeError)
                new_init_score = np.zeros(init_score.size, dtype=np.float32)
                num_data = self.num_data()
                for i in range(num_data):
                    for j in range(self.predictor.num_class):
                        new_init_score[j * num_data + i] = init_score[i * self.predictor.num_class + j]
                init_score = new_init_score
            self.set_init_score(init_score)

    def create_valid(self, data, label=None, weight=None, group=None,
                     silent=False, params=None):
        """
        Create validation data align with current dataset

        Parameters
        ----------
        data : string/numpy array/scipy.sparse
            Data source of Dataset.
            When data is string type, it represents the path of txt file,
        label : list or numpy 1-D array, optional
            Label of the training data.
        weight : list or numpy 1-D array , optional
            Weight for each instance.
        group : list or numpy 1-D array , optional
            group/query size for dataset
        silent : boolean, optional
            Whether print messages during construction
        params: dict, optional
            other parameters
        """
        # reuse this dataset's binning and predictor so validation data
        # is mapped into the same bins
        return Dataset(data, label=label, max_bin=self.max_bin, reference=self,
                       weight=weight, group=group, predictor=self.predictor,
                       silent=silent, params=params)

    def subset(self, used_indices, params=None):
        """
        Get subset of current dataset

        Parameters
        ----------
        used_indices : list or numpy 1-D array of int
            Row indices to keep
        params : dict, optional
            other parameters
        """
        used_indices = list_to_1d_numpy(used_indices, np.int32)
        ret = Dataset(None)
        ret.handle = ctypes.c_void_p()
        params_str = param_dict_to_str(params)
        # NOTE(review): this passes a pointer to the handle; confirm
        # LGBM_DatasetGetSubset expects DatasetHandle* here
        _safe_call(_LIB.LGBM_DatasetGetSubset(
            ctypes.byref(self.handle),
            used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            used_indices.shape[0],
            c_str(params_str),
            ctypes.byref(ret.handle)))
        ret.max_bin = self.max_bin
        ret.predictor = self.predictor
        if ret.get_label() is None:
            raise ValueError("label should not be None")
        return ret

    def __init_from_np2d(self, mat, params_str, ref_dataset):
        """
        Initialize data from a 2-D numpy matrix.
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')
        self.handle = ctypes.c_void_p()
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
        else:
            # change non-float data to float data, need to copy
            data = np.array(mat.reshape(mat.size), dtype=np.float32)
        ptr_data, type_ptr_data = c_float_array(data)
        _safe_call(_LIB.LGBM_DatasetCreateFromMat(
            ptr_data,
            type_ptr_data,
            mat.shape[0],
            mat.shape[1],
            C_API_IS_ROW_MAJOR,
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))

    def __init_from_csr(self, csr, params_str, ref_dataset):
        """
        Initialize data from a CSR matrix.
        """
        if len(csr.indices) != len(csr.data):
            raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
        self.handle = ctypes.c_void_p()
        ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
        ptr_data, type_ptr_data = c_float_array(csr.data)
        _safe_call(_LIB.LGBM_DatasetCreateFromCSR(
            ptr_indptr,
            type_ptr_indptr,
            csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            type_ptr_data,
            len(csr.indptr),
            len(csr.data),
            csr.shape[1],
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))

    def __del__(self):
        # guard: Dataset(None) (e.g. via subset()) may leave handle as None
        if self.handle is not None:
            _safe_call(_LIB.LGBM_DatasetFree(self.handle))

    def get_field(self, field_name):
        """Get property from the Dataset.

        Parameters
        ----------
        field_name: str
            The field name of the information

        Returns
        -------
        info : array
            a numpy array of information of the data
        """
        tmp_out_len = ctypes.c_int64()
        out_type = ctypes.c_int32()
        ret = ctypes.POINTER(ctypes.c_void_p)()
        _safe_call(_LIB.LGBM_DatasetGetField(
            self.handle,
            c_str(field_name),
            ctypes.byref(tmp_out_len),
            ctypes.byref(ret),
            ctypes.byref(out_type)))
        if out_type.value != FIELD_TYPE_MAPPER[field_name]:
            raise TypeError("Return type error for get_field")
        if tmp_out_len.value == 0:
            return None
        # copy the C-owned buffer into a numpy array of the reported type
        if out_type.value == C_API_DTYPE_INT32:
            return cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
        elif out_type.value == C_API_DTYPE_FLOAT32:
            return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
        else:
            raise TypeError("unknow type")

    def set_field(self, field_name, data):
        """Set property into the Dataset.

        Parameters
        ----------
        field_name: str
            The field name of the information
        data: numpy array or list or None
            The array of data to be set
        """
        if data is None:
            # passing None clears the field on the C side
            _safe_call(_LIB.LGBM_DatasetSetField(
                self.handle,
                c_str(field_name),
                None,
                0,
                FIELD_TYPE_MAPPER[field_name]))
            return
        if not is_numpy_1d_array(data):
            raise TypeError("Unknow type({})".format(type(data).__name__))
        if data.dtype == np.float32:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
            type_data = C_API_DTYPE_FLOAT32
        elif data.dtype == np.int32:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
            type_data = C_API_DTYPE_INT32
        else:
            raise TypeError("excepted np.float32 or np.int32, met type({})".format(data.dtype))
        if type_data != FIELD_TYPE_MAPPER[field_name]:
            raise TypeError("type error for set_field")
        _safe_call(_LIB.LGBM_DatasetSetField(
            self.handle,
            c_str(field_name),
            ptr_data,
            len(data),
            type_data))

    def save_binary(self, filename):
        """Save Dataset to binary file

        Parameters
        ----------
        filename : string
            Name of the output file.
        """
        _safe_call(_LIB.LGBM_DatasetSaveBinary(
            self.handle,
            c_str(filename)))

    def set_label(self, label):
        """Set label of Dataset

        Parameters
        ----------
        label: array like
            The label information to be set into Dataset
        """
        label = list_to_1d_numpy(label, np.float32)
        self.__label = label
        self.set_field('label', label)

    def set_weight(self, weight):
        """ Set weight of each instance.

        Parameters
        ----------
        weight : array like
            Weight for each data point
        """
        if weight is not None:
            weight = list_to_1d_numpy(weight, np.float32)
        self.__weight = weight
        self.set_field('weight', weight)

    def set_init_score(self, score):
        """ Set init score of booster to start from.

        Parameters
        ----------
        score: array like
        """
        if score is not None:
            score = list_to_1d_numpy(score, np.float32)
        self.__init_score = score
        self.set_field('init_score', score)

    def set_group(self, group):
        """Set group size of Dataset (used for ranking).

        Parameters
        ----------
        group : array like
            Group size of each group
        """
        if group is not None:
            group = list_to_1d_numpy(group, np.int32)
        self.__group = group
        self.set_field('group', group)

    def get_label(self):
        """Get the label of the Dataset.

        Returns
        -------
        label : array
        """
        if self.__label is None:
            self.__label = self.get_field('label')
        if self.__label is None:
            raise TypeError("label should not be None")
        return self.__label

    def get_weight(self):
        """Get the weight of the Dataset.

        Returns
        -------
        weight : array
        """
        if self.__weight is None:
            self.__weight = self.get_field('weight')
        return self.__weight

    def get_init_score(self):
        """Get the initial score of the Dataset.

        Returns
        -------
        init_score : array
        """
        if self.__init_score is None:
            self.__init_score = self.get_field('init_score')
        return self.__init_score

    def get_group(self):
        """Get the group sizes of the Dataset.

        Returns
        -------
        group : array
        """
        if self.__group is None:
            self.__group = self.get_field('group')
        return self.__group

    def num_data(self):
        """Get the number of rows in the Dataset.

        Returns
        -------
        number of rows : int
        """
        ret = ctypes.c_int64()
        _safe_call(_LIB.LGBM_DatasetGetNumData(self.handle,
                                               ctypes.byref(ret)))
        return ret.value

    def num_feature(self):
        """Get the number of columns (features) in the Dataset.

        Returns
        -------
        number of columns : int
        """
        ret = ctypes.c_int64()
        _safe_call(_LIB.LGBM_DatasetGetNumFeature(self.handle,
                                                  ctypes.byref(ret)))
        return ret.value
class Booster(object):
    """A Booster of LightGBM.

    Thin wrapper around the native LGBM_Booster* C API: owns the booster
    handle and exposes training (update/eval), prediction and model
    serialization entry points.
    """
    def __init__(self, params=None, train_set=None, model_file=None, silent=False):
        """Initialize the Booster.

        Exactly one of ``train_set`` (training task) or ``model_file``
        (prediction task) should be provided.

        Parameters
        ----------
        params : dict
            Parameters for boosters.
        train_set : Dataset
            training dataset
        model_file : string
            Path to the model file.
        silent : boolean, optional
            Whether print messages during construction

        Raises
        ------
        TypeError
            If neither train_set nor model_file is given, or train_set is
            not a Dataset instance.
        """
        self.handle = ctypes.c_void_p()
        self.__need_reload_eval_info = True
        self.__is_manage_handle = True
        self.__train_data_name = "training"
        self.__attr = {}
        self.best_iteration = -1
        params = {} if params is None else params
        if silent:
            params["verbose"] = 0
        elif "verbose" not in params:
            params["verbose"] = 1
        if train_set is not None:
            """Training task"""
            if not isinstance(train_set, Dataset):
                raise TypeError('training data should be Dataset instance, met{}'.format(type(train_set).__name__))
            params_str = param_dict_to_str(params)
            """construct booster object"""
            _safe_call(_LIB.LGBM_BoosterCreate(
                train_set.handle,
                c_str(params_str),
                ctypes.byref(self.handle)))
            """save reference to data"""
            self.train_set = train_set
            self.valid_sets = []
            self.name_valid_sets = []
            self.__num_dataset = 1
            self.init_predictor = train_set.predictor
            if self.init_predictor is not None:
                # continued training: merge the initial model into this booster
                _safe_call(_LIB.LGBM_BoosterMerge(
                    self.handle,
                    self.init_predictor.handle))
            out_num_class = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.__num_class = out_num_class.value
            """buffer for inner predict"""
            self.__inner_predict_buffer = [None]
            self.__is_predicted_cur_iter = [False]
            self.__get_eval_info()
        elif model_file is not None:
            """Prediction task"""
            out_num_iterations = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(out_num_iterations),
                ctypes.byref(self.handle)))
            out_num_class = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.__num_class = out_num_class.value
        else:
            raise TypeError('At least need training dataset or model file to create booster instance')
    def __del__(self):
        # free the native handle only if ownership has not been handed
        # over to a Predictor via to_predictor()
        if self.handle is not None and self.__is_manage_handle:
            _safe_call(_LIB.LGBM_BoosterFree(self.handle))
    def set_train_data_name(self, name):
        """Set the name used for the training data in evaluation output."""
        self.__train_data_name = name
    def add_valid(self, data, name):
        """Add a validation dataset.

        Parameters
        ----------
        data : Dataset
            validation data; must share the predictor used at construction
        name : String
            name of validation data
        """
        if data.predictor is not self.init_predictor:
            raise Exception("Add validation data failed, you should use same predictor for these data")
        _safe_call(_LIB.LGBM_BoosterAddValidData(
            self.handle,
            data.handle))
        self.valid_sets.append(data)
        self.name_valid_sets.append(name)
        self.__num_dataset += 1
        # one inner-predict buffer / cache flag per registered dataset
        self.__inner_predict_buffer.append(None)
        self.__is_predicted_cur_iter.append(False)
    def reset_parameter(self, params):
        """Reset parameters of the booster.

        Parameters
        ----------
        params : dict
            New parameter values. Changing 'metric' forces the cached
            eval names/counts to be reloaded lazily.
        """
        if 'metric' in params:
            self.__need_reload_eval_info = True
        params_str = param_dict_to_str(params)
        if params_str:
            _safe_call(_LIB.LGBM_BoosterResetParameter(
                self.handle,
                c_str(params_str)))
    def update(self, train_set=None, fobj=None):
        """
        Update for one iteration.
        Note: for multi-class task, the score is grouped by class_id first,
        then by row_id; the i-th row score in the j-th class is
        score[j*num_data+i], and grad and hess must be grouped the same way.

        Parameters
        ----------
        train_set : training data, None means use last training data
        fobj : function
            Customized objective function.
        Returns
        -------
        is_finished, bool
        """
        """need reset training data"""
        if train_set is not None and train_set is not self.train_set:
            if train_set.predictor is not self.init_predictor:
                raise Exception("Replace training data failed, you should use same predictor for these data")
            self.train_set = train_set
            _safe_call(_LIB.LGBM_BoosterResetTrainingData(
                self.handle,
                self.train_set.handle))
            # buffer sized for the old training data is now invalid
            self.__inner_predict_buffer[0] = None
        is_finished = ctypes.c_int(0)
        if fobj is None:
            _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
                self.handle,
                ctypes.byref(is_finished)))
            # scores changed: invalidate cached per-dataset predictions
            self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
            return is_finished.value == 1
        else:
            grad, hess = fobj(self.__inner_predict(0), self.train_set)
            return self.__boost(grad, hess)
    def __boost(self, grad, hess):
        """
        Boost the booster for one iteration, with customized gradient statistics.
        Note: for multi-class task, the score is grouped by class_id first,
        then by row_id; the i-th row score in the j-th class is
        score[j*num_data+i], and grad and hess must be grouped the same way.

        Parameters
        ----------
        grad : 1d numpy or 1d list
            The first order of gradient.
        hess : 1d numpy or 1d list
            The second order of gradient.
        Returns
        -------
        is_finished, bool
        """
        if not is_numpy_1d_array(grad):
            if is_1d_list(grad):
                grad = np.array(grad, dtype=np.float32, copy=False)
            else:
                raise TypeError("grad should be numpy 1d array or 1d list")
        if not is_numpy_1d_array(hess):
            if is_1d_list(hess):
                hess = np.array(hess, dtype=np.float32, copy=False)
            else:
                raise TypeError("hess should be numpy 1d array or 1d list")
        if len(grad) != len(hess):
            raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
        # the C API expects float32 buffers
        if grad.dtype != np.float32:
            grad = grad.astype(np.float32, copy=False)
        if hess.dtype != np.float32:
            hess = hess.astype(np.float32, copy=False)
        is_finished = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
            self.handle,
            grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            ctypes.byref(is_finished)))
        # scores changed: invalidate cached per-dataset predictions
        self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
        return is_finished.value == 1
    def rollback_one_iter(self):
        """
        Rollback one iteration.
        """
        _safe_call(_LIB.LGBM_BoosterRollbackOneIter(
            self.handle))
        # scores changed: invalidate cached per-dataset predictions
        self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
    def current_iteration(self):
        """Get the index of the current boosting iteration (int)."""
        out_cur_iter = ctypes.c_int64(0)
        _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
            self.handle,
            ctypes.byref(out_cur_iter)))
        return out_cur_iter.value
    def eval(self, data, name, feval=None):
        """Evaluate for data.

        Parameters
        ----------
        data : Dataset object
            Data to evaluate; if not seen before it is registered as a
            new validation set under `name`.
        name : name of data
        feval : function
            Custom evaluation function.
        Returns
        -------
        result: list
            Evaluation result list.
        """
        if not isinstance(data, Dataset):
            raise TypeError("Can only eval for Dataset instance")
        data_idx = -1
        if data is self.train_set:
            data_idx = 0
        else:
            for i in range(len(self.valid_sets)):
                if data is self.valid_sets[i]:
                    data_idx = i + 1
                    break
        """need to push new valid data"""
        if data_idx == -1:
            self.add_valid(data, name)
            data_idx = self.__num_dataset - 1
        return self.__inner_eval(name, data_idx, feval)
    def eval_train(self, feval=None):
        """Evaluate for training data.

        Parameters
        ----------
        feval : function
            Custom evaluation function.
        Returns
        -------
        result: list
            Evaluation result list.
        """
        return self.__inner_eval(self.__train_data_name, 0, feval)
    def eval_valid(self, feval=None):
        """Evaluate for all validation data.

        Parameters
        ----------
        feval : function
            Custom evaluation function.
        Returns
        -------
        result: list
            Evaluation result list.
        """
        ret = []
        for i in range(1, self.__num_dataset):
            ret.extend(self.__inner_eval(self.name_valid_sets[i-1], i, feval))
        return ret
    def save_model(self, filename, num_iteration=-1):
        """Save model of booster to file.

        Parameters
        ----------
        filename : str
            filename to save
        num_iteration: int
            number of iteration that want to save. < 0 means save all
        """
        _safe_call(_LIB.LGBM_BoosterSaveModel(
            self.handle,
            num_iteration,
            c_str(filename)))
    def dump_model(self):
        """
        Dump model to json format.

        Returns
        -------
        Json format of model
        """
        buffer_len = 1 << 20
        tmp_out_len = ctypes.c_int64(0)
        string_buffer = ctypes.create_string_buffer(buffer_len)
        ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
        _safe_call(_LIB.LGBM_BoosterDumpModel(
            self.handle,
            buffer_len,
            ctypes.byref(tmp_out_len),
            ctypes.byref(ptr_string_buffer)))
        actual_len = tmp_out_len.value
        # if the default buffer was too small, retry with the reported size
        if actual_len > buffer_len:
            string_buffer = ctypes.create_string_buffer(actual_len)
            ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
            _safe_call(_LIB.LGBM_BoosterDumpModel(
                self.handle,
                actual_len,
                ctypes.byref(tmp_out_len),
                ctypes.byref(ptr_string_buffer)))
        return json.loads(string_buffer.value.decode())
    def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
        """
        Predict logic.

        Parameters
        ----------
        data : string/numpy array/scipy.sparse
            Data source for prediction
            When data is string type, it represents the path of txt file,
        num_iteration : int
            used iteration for prediction
        raw_score : bool
            True for predict raw score
        pred_leaf : bool
            True for predict leaf index
        data_has_header : bool
            Used for txt data
        is_reshape : bool
            True for reshape to [nrow, ...]
        Returns
        -------
        Prediction result
        """
        # temporary Predictor; the booster keeps ownership of the handle
        predictor = Predictor(booster_handle=self.handle, is_manage_handle=False)
        return predictor.predict(data, num_iteration, raw_score, pred_leaf, data_has_header, is_reshape)
    def to_predictor(self):
        """Convert to predictor.
        Note: the Predictor will manage the handle after doing this, so
        this booster must no longer free it.
        """
        predictor = Predictor(booster_handle=self.handle, is_manage_handle=True)
        self.__is_manage_handle = False
        return predictor
    def __inner_eval(self, data_name, data_idx, feval=None):
        """
        Evaluate training or validation data.
        Returns a list of (data_name, eval_name, result, is_higher_better).
        """
        if data_idx >= self.__num_dataset:
            raise ValueError("data_idx should be smaller than number of dataset")
        self.__get_eval_info()
        ret = []
        if self.__num_inner_eval > 0:
            result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float32)
            tmp_out_len = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetEval(
                self.handle,
                data_idx,
                ctypes.byref(tmp_out_len),
                result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
            if tmp_out_len.value != self.__num_inner_eval:
                raise ValueError("incorrect number of eval results")
            for i in range(self.__num_inner_eval):
                ret.append((data_name, self.__name_inner_eval[i], result[i], self.__higher_better_inner_eval[i]))
        if feval is not None:
            if data_idx == 0:
                cur_data = self.train_set
            else:
                cur_data = self.valid_sets[data_idx - 1]
            # feval may return one tuple or a list of tuples
            feval_ret = feval(self.__inner_predict(data_idx), cur_data)
            if isinstance(feval_ret, list):
                for eval_name, val, is_higher_better in feval_ret:
                    ret.append((data_name, eval_name, val, is_higher_better))
            else:
                eval_name, val, is_higher_better = feval_ret
                ret.append((data_name, eval_name, val, is_higher_better))
        return ret
    def __inner_predict(self, data_idx):
        """
        Predict for training and validation dataset.
        Results are cached per dataset until the next boosting update.
        """
        if data_idx >= self.__num_dataset:
            raise ValueError("data_idx should be smaller than number of dataset")
        if self.__inner_predict_buffer[data_idx] is None:
            if data_idx == 0:
                n_preds = self.train_set.num_data() * self.__num_class
            else:
                n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
            self.__inner_predict_buffer[data_idx] = \
                np.array([0.0 for _ in range(n_preds)], dtype=np.float32, copy=False)
        """avoid to predict many time in one iteration"""
        if not self.__is_predicted_cur_iter[data_idx]:
            tmp_out_len = ctypes.c_int64(0)
            data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float))
            _safe_call(_LIB.LGBM_BoosterGetPredict(
                self.handle,
                data_idx,
                ctypes.byref(tmp_out_len),
                data_ptr))
            if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
                raise ValueError("incorrect number of predict results for data %d" % (data_idx))
            self.__is_predicted_cur_iter[data_idx] = True
        return self.__inner_predict_buffer[data_idx]
    def __get_eval_info(self):
        """
        Get inner evaluation count and names.
        Cached; reloaded only after reset_parameter changed 'metric'.
        """
        if self.__need_reload_eval_info:
            self.__need_reload_eval_info = False
            out_num_eval = ctypes.c_int64(0)
            """Get num of inner evals"""
            _safe_call(_LIB.LGBM_BoosterGetEvalCounts(
                self.handle,
                ctypes.byref(out_num_eval)))
            self.__num_inner_eval = out_num_eval.value
            if self.__num_inner_eval > 0:
                """Get name of evals"""
                tmp_out_len = ctypes.c_int64(0)
                string_buffers = [ctypes.create_string_buffer(255) for i in range(self.__num_inner_eval)]
                ptr_string_buffers = (ctypes.c_char_p*self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
                _safe_call(_LIB.LGBM_BoosterGetEvalNames(
                    self.handle,
                    ctypes.byref(tmp_out_len),
                    ptr_string_buffers))
                if self.__num_inner_eval != tmp_out_len.value:
                    raise ValueError("size of eval names doesn't equal with num_evals")
                self.__name_inner_eval = []
                for i in range(self.__num_inner_eval):
                    self.__name_inner_eval.append(string_buffers[i].value.decode())
                # metrics whose values improve upwards (prefix match)
                self.__higher_better_inner_eval = []
                higher_better_metric = ['auc', 'ndcg']
                for name in self.__name_inner_eval:
                    if any(name.startswith(x) for x in higher_better_metric):
                        self.__higher_better_inner_eval.append(True)
                    else:
                        self.__higher_better_inner_eval.append(False)
    def attr(self, key):
        """Get attribute string from the Booster.

        Parameters
        ----------
        key : str
            The key to get attribute from.
        Returns
        -------
        value : str
            The attribute value of the key, returns None if attribute do not exist.
        """
        if key in self.__attr:
            return self.__attr[key]
        else:
            return None
    def set_attr(self, **kwargs):
        """Set the attribute of the Booster.

        Parameters
        ----------
        **kwargs
            The attributes to set. Setting a value to None deletes an attribute.
        """
        for key, value in kwargs.items():
            if value is not None:
                if not is_str(value):
                    raise ValueError("Set Attr only accepts string values")
                self.__attr[key] = value
            else:
                self.__attr.pop(key, None)
# coding: utf-8
# pylint: disable = invalid-name, W0105
from __future__ import absolute_import
import collections
class EarlyStopException(Exception):
    """Exception raised by a callback to terminate training early.

    Parameters
    ----------
    best_iteration : int
        The best iteration stopped.
    """
    def __init__(self, best_iteration):
        super(EarlyStopException, self).__init__()
        # kept as an attribute so train() can report the best round
        self.best_iteration = best_iteration
# Callback environment used by callbacks.
# - model: the Booster being trained (train() passes the booster here)
# - cvfolds: CV fold boosters; train() passes None
# - iteration: current round; begin_iteration/end_iteration: round bounds
# - evaluation_result_list: list of (data_name, eval_name, result,
#   is_higher_better[, stdv]) tuples; None for before-iteration callbacks
CallbackEnv = collections.namedtuple(
    "LightGBMCallbackEnv",
    ["model",
     "cvfolds",
     "iteration",
     "begin_iteration",
     "end_iteration",
     "evaluation_result_list"])
def _format_eval_result(value, show_stdv=True):
"""format metric string"""
if len(value) == 4:
return '%s\'s %s:%g' % (value[0], value[1], value[2])
elif len(value) == 5:
if show_stdv:
return '%s\'s %s:%g+%g' % (value[0], value[1], value[2], value[4])
else:
return '%s\'s %s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
def print_evaluation(period=1, show_stdv=True):
    """Create a callback that prints the evaluation results.

    Parameters
    ----------
    period : int
        The period to log the evaluation results
    show_stdv : bool, optional
        Whether show stdv if provided
    Returns
    -------
    callback : function
        A callback that prints evaluation every `period` iterations.
    """
    def callback(env):
        """internal function"""
        if len(env.evaluation_result_list) == 0 or period is False:
            return
        if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
            msgs = [_format_eval_result(res, show_stdv)
                    for res in env.evaluation_result_list]
            print('[%d]\t%s' % (env.iteration, '\t'.join(msgs)))
    return callback
def record_evaluation(eval_result):
    """Create a callback that records the evaluation history into eval_result.

    Parameters
    ----------
    eval_result : dict
        A dictionary to store the evaluation results; it is cleared first.
    Returns
    -------
    callback : function
        The requested callback function.
    """
    if not isinstance(eval_result, dict):
        raise TypeError('eval_result has to be a dictionary')
    eval_result.clear()
    def _init(env):
        """internal function: create the nested result lists lazily"""
        for data_name, eval_name, _, _ in env.evaluation_result_list:
            eval_result.setdefault(data_name, {}).setdefault(eval_name, [])
    def callback(env):
        """internal function"""
        if not eval_result:
            _init(env)
        for data_name, eval_name, result, _ in env.evaluation_result_list:
            eval_result[data_name][eval_name].append(result)
    return callback
def reset_learning_rate(learning_rates):
    """Reset learning rate before each iteration (after the first).

    NOTE: the initial learning rate will still take effect on the first
    iteration.

    Parameters
    ----------
    learning_rates : list or function
        List of learning rate for each boosting round, or a customized
        function computing the rate from the current round and the total
        number of boosting rounds (e.g. yields learning rate decay)
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)
    Returns
    -------
    callback : function
        The requested callback function.
    """
    def callback(env):
        """internal function"""
        iteration = env.iteration
        if isinstance(learning_rates, list):
            if len(learning_rates) != env.end_iteration:
                raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
            new_rate = learning_rates[iteration]
        else:
            new_rate = learning_rates(iteration, env.end_iteration)
        env.model.reset_parameter({'learning_rate': new_rate})
    # marked so train() runs this before the boosting update
    callback.before_iteration = True
    return callback
def early_stop(stopping_rounds, verbose=True):
    """Create a callback that activates early stopping.

    Requires at least one validation data and one metric; if there is
    more than one, all of them are checked.

    Parameters
    ----------
    stopping_rounds : int
        Number of rounds without improvement before stopping.
    verbose : optional, bool
        Whether to print message about early stopping information.
    Returns
    -------
    callback : function
        The requested callback function.
    """
    # closure state, keyed by position in evaluation_result_list
    factor_to_bigger_better = {}
    best_score = {}
    best_iter = {}
    best_msg = {}
    def _init(env):
        """internal function"""
        if len(env.evaluation_result_list) == 0:
            raise ValueError('For early stopping you need at least one set in evals.')
        if verbose:
            msg = "Train until valid scores didn't improve in {} rounds."
            print(msg.format(stopping_rounds))
        for idx, eval_ret in enumerate(env.evaluation_result_list):
            best_score[idx] = float('-inf')
            best_iter[idx] = 0
            if verbose:
                best_msg[idx] = ""
            # flip the sign for lower-is-better metrics so "bigger is better"
            factor_to_bigger_better[idx] = 1.0 if eval_ret[3] else -1.0
    def callback(env):
        """internal function"""
        if len(best_score) == 0:
            _init(env)
        for idx in range(len(env.evaluation_result_list)):
            score = env.evaluation_result_list[idx][2] * factor_to_bigger_better[idx]
            if score > best_score[idx]:
                best_score[idx] = score
                best_iter[idx] = env.iteration
                if verbose:
                    best_msg[idx] = '[%d]\t%s' % (
                        env.iteration,
                        '\t'.join(_format_eval_result(x) for x in env.evaluation_result_list))
            elif env.iteration - best_iter[idx] >= stopping_rounds:
                if env.model is not None:
                    env.model.set_attr(best_iteration=str(best_iter[idx]))
                if verbose:
                    print('early stopping, best iteration is:\n{}'.format(best_msg[idx]))
                raise EarlyStopException(best_iter[idx])
    return callback
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Training Library containing training routines of LightGBM."""
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback
def _construct_dataset(X_y, reference=None,
                       params=None, other_fields=None,
                       predictor=None):
    """Build a Dataset (or validation Dataset) from raw data.

    Parameters
    ----------
    X_y : str or (data, label) pair
        Path of a data file, or a (data, label) tuple.
    reference : Dataset or None
        When given, the new data is created as a validation set of it.
    params : dict or None
        Dataset parameters; 'max_bin' is extracted (default 255).
    other_fields : dict or None
        Optional extra fields: 'weight', 'group', 'init_score'.
    predictor : Predictor or None
        Predictor used for continued training.

    Returns
    -------
    Dataset
    """
    # fix: the default params=None used to crash on "'max_bin' in params"
    params = {} if params is None else params
    max_bin = int(params['max_bin']) if 'max_bin' in params else 255
    weight = None
    group = None
    init_score = None
    if other_fields is not None:
        if not isinstance(other_fields, dict):
            raise TypeError("other field data should be dict type")
        weight = other_fields.get('weight')
        group = other_fields.get('group')
        init_score = other_fields.get('init_score')
    if is_str(X_y):
        data, label = X_y, None
    else:
        if len(X_y) != 2:
            raise TypeError("should pass (data, label) pair")
        data, label = X_y
    if reference is None:
        ret = Dataset(data, label=label, max_bin=max_bin,
                      weight=weight, group=group,
                      predictor=predictor, params=params)
    else:
        ret = reference.create_valid(data, label=label, weight=weight,
                                     group=group, params=params)
    # init_score has no constructor argument, so it is set afterwards
    if init_score is not None:
        ret.set_init_score(init_score)
    return ret
def train(params, train_data, num_boost_round=100,
          valid_datas=None, valid_names=None,
          fobj=None, feval=None, init_model=None,
          train_fields=None, valid_fields=None,
          early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, learning_rates=None, callbacks=None):
    """Train with given parameters.

    Parameters
    ----------
    params : dict
        params.
    train_data : Dataset, tuple (X, y) or filename of data
        Data to be trained.
    num_boost_round: int
        Number of boosting iterations.
    valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data
        List of data to be evaluated during training
    valid_names: list of string
        names of valid_datas
    fobj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
        Note: should return (eval_name, eval_result, is_higher_better) or list of this
    init_model : file name of lightgbm model or 'Booster' instance
        model used for continued train
    train_fields : dict
        other data file in training data. e.g. train_fields['weight'] is weight data
        support fields: weight, group, init_score
    valid_fields : dict
        other data file in training data.
        e.g. valid_fields[0]['weight'] is weight data for first valid data
        support fields: weight, group, init_score
    early_stopping_rounds: int
        Activates early stopping.
        Requires at least one validation data and one metric
        If there's more than one, will check all of them
        Returns the model with (best_iter + early_stopping_rounds)
        If early stopping occurs, the model will add 'best_iteration' field
    evals_result: dict or None
        This dictionary is used to store all evaluation results of all the items in valid_datas.
        Example: with a valid_datas containing [valid_set, train_set]
        and valid_names containing ['eval', 'train'] and a parameter containing {'metric': 'logloss'}
        Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
                  'eval': {'logloss': ['0.480385', '0.357756', ...]}}
        passing None means not using this feature
    verbose_eval : bool or int
        Requires at least one item in evals.
        If `verbose_eval` is True then the evaluation metric on the validation set is
        printed at each boosting stage.
        If `verbose_eval` is an integer then the evaluation metric on the validation set
        is printed at every given `verbose_eval` boosting stage. The last boosting stage
        / the boosting stage found by using `early_stopping_rounds` is also printed.
        Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
        is printed every 4 boosting stages, instead of every boosting stage.
    learning_rates: list or function
        List of learning rate for each boosting round
        or a customized function that calculates learning_rate in terms of
        current number of round and the total number of boosting round (e.g. yields
        learning rate decay)
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
    Returns
    -------
    booster : a trained booster model
    """
    """create predictor first"""
    if is_str(init_model):
        predictor = Predictor(model_file=init_model)
    elif isinstance(init_model, Booster):
        predictor = init_model.to_predictor()
    elif isinstance(init_model, Predictor):
        predictor = init_model
    else:
        predictor = None
    """create dataset"""
    if isinstance(train_data, Dataset):
        train_set = train_data
    else:
        train_set = _construct_dataset(train_data, None, params, train_fields, predictor)
    is_valid_contain_train = False
    train_data_name = "training"
    valid_sets = []
    name_valid_sets = []
    if valid_datas is not None:
        # normalize a single validation entry / name to a list
        if isinstance(valid_datas, (Dataset, tuple)):
            valid_datas = [valid_datas]
        if isinstance(valid_names, str):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_datas):
            other_fields = None if valid_fields is None else valid_fields[i]
            """reduce cost for prediction training data"""
            if valid_data is train_data:
                is_valid_contain_train = True
                if valid_names is not None:
                    train_data_name = valid_names[i]
                continue
            if isinstance(valid_data, Dataset):
                valid_set = valid_data
            else:
                valid_set = _construct_dataset(
                    valid_data,
                    train_set,
                    params,
                    other_fields,
                    predictor)
            valid_sets.append(valid_set)
            if valid_names is not None:
                name_valid_sets.append(valid_names[i])
            else:
                name_valid_sets.append('valid_'+str(i))
    """process callbacks"""
    callbacks = [] if callbacks is None else callbacks
    # Most legacy advanced options become callbacks
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation())
    else:
        if isinstance(verbose_eval, int):
            callbacks.append(callback.print_evaluation(verbose_eval))
    if early_stopping_rounds is not None:
        callbacks.append(callback.early_stop(early_stopping_rounds,
                                             verbose=bool(verbose_eval)))
    if learning_rates is not None:
        callbacks.append(callback.reset_learning_rate(learning_rates))
    if evals_result is not None:
        callbacks.append(callback.record_evaluation(evals_result))
    # split callbacks by the 'before_iteration' attribute their factory set
    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
    callbacks_after_iter = [
        cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
    """construct booster"""
    if 'metric' in params:
        # normalize 'metric' to a list so the booster sees all metrics
        if is_str(params['metric']):
            params['metric'] = params['metric'].split(',')
        else:
            params['metric'] = list(params['metric'])
    booster = Booster(params=params, train_set=train_set)
    if is_valid_contain_train:
        booster.set_train_data_name(train_data_name)
    for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
        booster.add_valid(valid_set, name_valid_set)
    """start training"""
    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=booster,
                                    cvfolds=None,
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))
        booster.update(fobj=fobj)
        evaluation_result_list = []
        # check evaluation result.
        if len(valid_sets) != 0:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=booster,
                                        cvfolds=None,
                                        iteration=i,
                                        begin_iteration=0,
                                        end_iteration=num_boost_round,
                                        evaluation_result_list=evaluation_result_list))
        except callback.EarlyStopException:
            break
    # the early-stop callback stores a 0-based best iteration as an attribute
    if booster.attr('best_iteration') is not None:
        booster.best_iteration = int(booster.attr('best_iteration')) + 1
    else:
        booster.best_iteration = num_boost_round
    return booster
class CVBooster(object):
    """Auxiliary data structure to hold one fold of CV."""
    def __init__(self, train_set, valid_test, params):
        """Initialize the CVBooster: build a Booster on the fold's training
        subset and register its validation subset under the name 'valid'."""
        self.train_set = train_set
        self.valid_test = valid_test
        self.booster = Booster(params=params, train_set=train_set)
        self.booster.add_valid(valid_test, 'valid')
    def update(self, fobj):
        """Update the booster for one iteration."""
        self.booster.update(fobj=fobj)
    def eval(self, feval):
        """Evaluate this fold's validation set; returns the result list."""
        return self.booster.eval_valid(feval)
try:
    # sklearn >= 0.18 provides StratifiedKFold in model_selection;
    # fall back to the old cross_validation location for earlier versions.
    try:
        from sklearn.model_selection import StratifiedKFold
    except ImportError:
        from sklearn.cross_validation import StratifiedKFold
    SKLEARN_StratifiedKFold = True
except ImportError:
    # sklearn unavailable: stratified CV raises LightGBMError at use time
    SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False):
    """
    Make an n-fold list of CVBooster from random indices.
    """
    np.random.seed(seed)
    if stratified:
        if not SKLEARN_StratifiedKFold:
            raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
        sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
        idset = [test_idx for _, test_idx in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
    else:
        randidx = np.random.permutation(full_data.num_data())
        kstep = int(len(randidx) / nfold)
        idset = [randidx[i * kstep: min(len(randidx), (i + 1) * kstep)]
                 for i in range(nfold)]
    folds = []
    for k in range(nfold):
        # train on every fold except k, validate on fold k
        train_idx = np.concatenate([idset[i] for i in range(nfold) if i != k])
        train_set = full_data.subset(train_idx)
        valid_set = full_data.subset(idset[k])
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            train_set, valid_set, tparam = fpreproc(train_set, valid_set, param.copy())
        else:
            tparam = param
        folds.append(CVBooster(train_set, valid_set, tparam))
    return folds
def _agg_cv_result(raw_results):
"""
Aggregate cross-validation results.
"""
cvmap = {}
metric_type = {}
for one_result in raw_results:
for one_line in one_result:
key = one_line[1]
metric_type[key] = one_line[3]
if key not in cvmap:
cvmap[key] = []
cvmap[key].append(one_line[2])
results = []
for k, v in cvmap.items():
v = np.array(v)
mean, std = np.mean(v), np.std(v)
results.append(('cv_agg', k, mean, metric_type[k], std))
return results
def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
       metrics=(), fobj=None, feval=None, train_fields=None, early_stopping_rounds=None,
       fpreproc=None, verbose_eval=None, show_stdv=True, seed=0,
       callbacks=None):
    """Cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    train_data : pair, (X, y) or filename of data
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
    stratified : bool
        Perform stratified sampling (requires sklearn).
    metrics : string or list of strings
        Evaluation metrics to be watched in CV; appended to params['metric'].
    fobj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    train_fields : dict
        other data file in training data. e.g. train_fields['weight'] is weight data
        support fields: weight, group, init_score
    early_stopping_rounds: int
        Activates early stopping. CV error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue.
        Last entry in evaluation history is the one from best iteration.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.
    verbose_eval : bool, int, or None, default None
        Whether to display the progress. If None, progress is not displayed.
        If True, progress will be displayed at every boosting stage. If an
        integer is given, progress will be displayed at every given
        `verbose_eval` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contains std.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.

    Returns
    -------
    dict
        Evaluation history: maps '<metric>-mean' and '<metric>-std' to a list
        with one entry per boosting round (truncated to the best iteration
        when early stopping fires).
    """
    # Normalize metrics to a list so a single metric name can be passed as str.
    if isinstance(metrics, str):
        metrics = [metrics]
    if isinstance(params, list):
        params = dict(params)
    # Normalize params['metric'] to a list of metric names.
    if 'metric' not in params:
        params['metric'] = []
    else:
        if is_str(params['metric']):
            params['metric'] = params['metric'].split(',')
        else:
            params['metric'] = list(params['metric'])
    if metrics is not None and len(metrics) > 0:
        params['metric'].extend(metrics)
    train_set = _construct_dataset(train_data, None, params, train_fields)
    results = {}
    cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
    # setup callbacks
    callbacks = [] if callbacks is None else callbacks
    if early_stopping_rounds is not None:
        # verbose=False: per-fold early-stopping messages would be noisy in CV.
        callbacks.append(callback.early_stop(early_stopping_rounds,
                                             verbose=False))
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
    else:
        # bool is a subclass of int, hence the explicit bool check above.
        if isinstance(verbose_eval, int):
            callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
    # Split callbacks by whether they run before or after each iteration.
    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
    callbacks_after_iter = [
        cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=None,
                                    cvfolds=cvfolds,
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))
        # One boosting round on every fold, then aggregate the fold metrics.
        for fold in cvfolds:
            fold.update(fobj)
        res = _agg_cv_result([f.eval(feval) for f in cvfolds])
        for _, key, mean, _, std in res:
            if key + '-mean' not in results:
                results[key + '-mean'] = []
            if key + '-std' not in results:
                results[key + '-std'] = []
            results[key + '-mean'].append(mean)
            results[key + '-std'].append(std)
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=None,
                                        cvfolds=cvfolds,
                                        iteration=i,
                                        begin_iteration=0,
                                        end_iteration=num_boost_round,
                                        evaluation_result_list=res))
        except callback.EarlyStopException as e:
            # Early stop: truncate history to the best iteration and stop.
            for k in results:
                results[k] = results[k][:(e.best_iteration + 1)]
            break
    return results
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment