Commit b51c7be4 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

refine Dataset class (#113)

Provide a high level Dataset class for easy use.
parent f3d33582
......@@ -14,7 +14,7 @@ before_install:
install:
- sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential
- conda install --yes atlas numpy scipy scikit-learn
- conda install --yes atlas numpy scipy scikit-learn pandas
script:
......@@ -22,12 +22,12 @@ script:
- mkdir build && cd build && cmake .. && make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
notifications:
email: false
......
......@@ -76,7 +76,7 @@ add_executable(lightgbm src/main.cpp ${SOURCES})
add_library(_lightgbm SHARED src/c_api.cpp ${SOURCES})
if(MSVC)
set_target_properties(_lightgbm PROPERTIES OUTPUT_NAME "lightgbm")
set_target_properties(_lightgbm PROPERTIES OUTPUT_NAME "lib_lightgbm")
endif(MSVC)
if(USE_MPI)
......
......@@ -17,13 +17,7 @@ X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# ATTENTION: you should carefully use lightgbm.Dataset
# it requires setting up categorical_feature when you init it
# rather than passing from lightgbm.train
# instead, you can simply use a tuple of length=2 like below
# it will help you construct Datasets with parameters in lightgbm.train
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)
# specify your configurations as a dict
params = {
......@@ -43,9 +37,7 @@ params = {
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
valid_datas=lgb_eval,
# you can use a list to represent multiple valid_datas/valid_names
# don't use tuple, tuple is used to represent one dataset
valid_sets=lgb_eval,
early_stopping_rounds=10)
# save model to file
......
......@@ -230,6 +230,7 @@ struct OverallConfig: public ConfigBase {
public:
TaskType task_type = TaskType::kTrain;
NetworkConfig network_config;
int seed = 0;
int num_threads = 0;
bool is_parallel = false;
bool is_parallel_find_bin = false;
......@@ -317,6 +318,7 @@ struct ParameterAlias {
{
{ "config", "config_file" },
{ "nthread", "num_threads" },
{ "random_seed", "seed" },
{ "num_thread", "num_threads" },
{ "boosting", "boosting_type" },
{ "boost", "boosting_type" },
......
......@@ -8,7 +8,7 @@ from __future__ import absolute_import
import os
from .basic import Predictor, Dataset, Booster
from .basic import Dataset, Booster
from .engine import train, cv
try:
from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
......
This diff is collapsed.
......@@ -6,52 +6,12 @@ from __future__ import absolute_import
import collections
from operator import attrgetter
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from .basic import LightGBMError, _InnerPredictor, Dataset, Booster, is_str
from . import callback
def _construct_dataset(X_y, reference=None,
                       params=None, other_fields=None,
                       feature_name=None, categorical_feature=None,
                       predictor=None):
    """Build a LightGBM Dataset (or a validation Dataset) from raw input.

    Parameters
    ----------
    X_y : str or tuple of length 2
        Filename of the data, or a ``(data, label)`` pair.
    reference : Dataset or None
        If given, create a validation set aligned with this training set
        (shares its bin mappers via ``create_valid``).
    params : dict or None
        Dataset parameters; ``max_bin`` is read from here (default 255).
    other_fields : dict or None
        Optional extra fields: 'weight', 'group', 'init_score'.
    feature_name : list of str or None
        Feature names (training set only).
    categorical_feature : list of str or int, or None
        Categorical features (training set only).
    predictor : object or None
        Predictor used for continued training (training set only).

    Returns
    -------
    Dataset
    """
    # Fix: the declared default params=None used to crash on
    # `'max_bin' in params` (TypeError: argument of type 'NoneType').
    if params is None:
        params = {}
    max_bin = int(params.get('max_bin', 255))
    weight = None
    group = None
    init_score = None
    if other_fields is not None:
        if not isinstance(other_fields, dict):
            raise TypeError("type of other field data should be dict")
        weight = other_fields.get('weight', None)
        group = other_fields.get('group', None)
        init_score = other_fields.get('init_score', None)
    if is_str(X_y):
        # A plain string is a data filename; the label comes from the file.
        data = X_y
        label = None
    else:
        if len(X_y) != 2:
            raise TypeError("should pass (data, label) tuple for dataset")
        data, label = X_y
    if reference is None:
        ret = Dataset(data, label=label, max_bin=max_bin,
                      weight=weight, group=group,
                      predictor=predictor,
                      feature_name=feature_name,
                      categorical_feature=categorical_feature,
                      params=params)
    else:
        # Validation data must be binned with the reference training set.
        ret = reference.create_valid(data, label=label, weight=weight,
                                     group=group, params=params)
    if init_score is not None:
        ret.set_init_score(init_score)
    return ret
def train(params, train_data, num_boost_round=100,
valid_datas=None, valid_names=None,
def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None,
train_fields=None, valid_fields=None,
feature_name=None, categorical_feature=None,
early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None):
......@@ -61,14 +21,14 @@ def train(params, train_data, num_boost_round=100,
----------
params : dict
Parameters for training.
train_data : Dataset, tuple (X, y) or filename of data
train_set : Dataset
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filenames of data
valid_sets: list of Datasets
List of data to be evaluated during training
valid_names: list of string
Names of valid_datas
Names of valid_sets
fobj : function
Customized objective function.
feval : function
......@@ -76,13 +36,6 @@ def train(params, train_data, num_boost_round=100,
Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
Support fields: weight, group, init_score
feature_name : list of str
Feature names
categorical_feature : list of str or int
......@@ -95,8 +48,8 @@ def train(params, train_data, num_boost_round=100,
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_datas.
Example: with a valid_datas containing [valid_set, train_set] \
This dictionary used to store all evaluation results of all the items in valid_sets.
Example: with a valid_sets containing [valid_set, train_set] \
and valid_names containing ['eval', 'train'] and a paramater containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
......@@ -127,58 +80,40 @@ def train(params, train_data, num_boost_round=100,
"""
"""create predictor first"""
if is_str(init_model):
predictor = Predictor(model_file=init_model)
predictor = _InnerPredictor(model_file=init_model)
elif isinstance(init_model, Booster):
predictor = init_model.to_predictor()
elif isinstance(init_model, Predictor):
predictor = init_model
predictor = init_model._to_predictor()
else:
predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0
"""create dataset"""
if isinstance(train_data, Dataset):
train_set = train_data
if train_fields is not None:
for field, data in train_fields.items():
train_set.set_field(field, data)
else:
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
feature_name=feature_name,
categorical_feature=categorical_feature,
predictor=predictor)
"""check dataset"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
is_valid_contain_train = False
train_data_name = "training"
valid_sets = []
reduced_valid_sets = []
name_valid_sets = []
if valid_datas:
if isinstance(valid_datas, (Dataset, tuple)):
valid_datas = [valid_datas]
if valid_sets:
if isinstance(valid_sets, Dataset):
valid_sets = [valid_sets]
if isinstance(valid_names, str):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas):
other_fields = None if valid_fields is None else valid_fields.get(i, None)
for i, valid_data in enumerate(valid_sets):
"""reduce cost for prediction training data"""
if valid_data[0] is train_data[0] and valid_data[1] is train_data[1]:
if valid_data is train_set:
is_valid_contain_train = True
if valid_names is not None:
train_data_name = valid_names[i]
continue
if isinstance(valid_data, Dataset):
valid_set = valid_data
if other_fields is not None:
for field, data in other_fields.items():
valid_set.set_field(field, data)
else:
valid_set = _construct_dataset(
valid_data,
train_set,
params,
other_fields=other_fields,
feature_name=feature_name,
categorical_feature=categorical_feature,
predictor=predictor)
valid_sets.append(valid_set)
if not isinstance(valid_data, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
valid_data.set_reference(train_set)
reduced_valid_sets.append(valid_data)
if valid_names is not None and len(valid_names) > i:
name_valid_sets.append(valid_names[i])
else:
......@@ -217,7 +152,7 @@ def train(params, train_data, num_boost_round=100,
booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train:
booster.set_train_data_name(train_data_name)
for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set)
"""start training"""
......@@ -294,6 +229,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
else:
raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
else:
full_data.construct()
randidx = np.random.permutation(full_data.num_data())
kstep = int(len(randidx) / nfold)
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
......@@ -322,8 +258,8 @@ def _agg_cv_result(raw_results):
cvmap[one_line[1]].append(one_line[2])
return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, train_fields=None,
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None,
early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0,
......@@ -334,7 +270,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
----------
params : dict
Booster params.
train_data : tuple (X, y) or filename of data
train_set : Dataset
Data to be trained.
num_boost_round : int
Number of boosting iterations.
......@@ -350,9 +286,8 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
Custom objective function.
feval : function
Custom evaluation function.
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
feature_name : list of str
Feature names
categorical_feature : list of str or int
......@@ -382,6 +317,20 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
-------
evaluation history : list(string)
"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
if is_str(init_model):
predictor = _InnerPredictor(model_file=init_model)
elif isinstance(init_model, Booster):
predictor = init_model._to_predictor()
else:
predictor = None
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
if metrics:
params.setdefault('metric', [])
if is_str(metrics):
......@@ -389,11 +338,6 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
else:
params['metric'].extend(metrics)
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
feature_name=feature_name,
categorical_feature=categorical_feature)
results = collections.defaultdict(list)
cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
......
......@@ -19,6 +19,7 @@ def find_lib_path():
if os.name == 'nt':
dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/'))
dll_path.append(os.path.join(curr_path, './windows/x64/Dll/'))
dll_path.append(os.path.join(curr_path, '../../Release/'))
dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path]
else:
dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path]
......
......@@ -4,7 +4,7 @@
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, is_str
from .basic import LightGBMError, Dataset, is_str
from .engine import train
# sklearn
try:
......@@ -195,9 +195,12 @@ class LGBMModel(LGBMModelBase):
params.pop('nthread', None)
return params
def fit(self, X, y, eval_set=None, eval_metric=None,
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None,
feature_name=None, categorical_feature=None,
other_params=None):
"""
......@@ -209,24 +212,29 @@ class LGBMModel(LGBMModelBase):
Feature matrix
y : array_like
Labels
sample_weight : array_like
weight of training data
init_score : array_like
init score of training data
group : array_like
group data of training data
eval_set : list, optional
A list of (X, y) tuple pairs to use as a validation set for early-stopping
eval_sample_weight : List of array
weight of eval data
eval_init_score : List of array
init score of eval data
eval_group : List of array
group data of eval data
eval_metric : str, list of str, callable, optional
If a str, should be a built-in evaluation metric to use.
If callable, a custom evaluation metric. The call \
signature is func(y_predicted, dataset) where dataset will be a \
Dataset fobject such that you may need to call the get_label \
Dateset object such that you may need to call the get_label \
method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
early_stopping_rounds : int
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
Support fields: weight, group, init_score
feature_name : list of str
Feature names
categorical_feature : list of str or int
......@@ -263,12 +271,33 @@ class LGBMModel(LGBMModelBase):
feval = None
feval = eval_metric if callable(eval_metric) else None
self._Booster = train(params, (X, y),
self.n_estimators, valid_datas=eval_set,
def _construct_dataset(X, y, sample_weight, init_score, group):
    """Wrap raw arrays into a Dataset with optional weight/group/init_score."""
    dataset = Dataset(X, label=y, weight=sample_weight, group=group)
    dataset.set_init_score(init_score)
    return dataset
train_set = _construct_dataset(X, y, sample_weight, init_score, group)
valid_sets = []
if eval_set is not None:
if isinstance(eval_set, tuple):
eval_set = [eval_set]
for i, valid_data in enumerate(eval_set):
"""reduce cost for prediction training data"""
if valid_data[0] is X and valid_data[1] is y:
valid_set = train_set
else:
valid_weight = None if eval_sample_weight is None else eval_sample_weight.get(i, None)
valid_init_score = None if eval_init_score is None else eval_init_score.get(i, None)
valid_group = None if eval_group is None else eval_group.get(i, None)
valid_set = _construct_dataset(valid_data[0], valid_data[1], valid_weight, valid_init_score, valid_group)
valid_sets.append(valid_set)
self._Booster = train(params, train_set,
self.n_estimators, valid_sets=valid_sets,
early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, train_fields=train_fields,
valid_fields=valid_fields, feature_name=feature_name,
verbose_eval=verbose, feature_name=feature_name,
categorical_feature=categorical_feature)
if evals_result:
......@@ -331,14 +360,48 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM regression.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
other_params=None):
super(LGBMRegressor, self).fit(X, y, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM classification.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y, eval_set=None, eval_metric=None,
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="binary",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
super(LGBMClassifier, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None,
feature_name=None, categorical_feature=None,
other_params=None):
......@@ -350,12 +413,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
# Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass"
other_params['num_class'] = self.n_classes_
if eval_metric is None and eval_set is not None:
eval_metric = "multi_logloss"
else:
self.objective = "binary"
if eval_metric is None and eval_set is not None:
eval_metric = "binary_logloss"
self._le = LGBMLabelEncoder().fit(y)
training_labels = self._le.transform(y)
......@@ -363,10 +420,10 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
if eval_set is not None:
eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set)
super(LGBMClassifier, self).fit(X, training_labels, eval_set,
super(LGBMClassifier, self).fit(X, training_labels, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, train_fields, valid_fields,
feature_name, categorical_feature,
verbose, feature_name, categorical_feature,
other_params)
return self
......@@ -442,34 +499,59 @@ class LGBMRanker(LGBMModel):
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y, eval_set=None, eval_metric=None,
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="lambdarank",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
super(LGBMRanker, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
if callable(self.objective):
self.fobj = _group_wise_objective(self.objective)
else:
self.fobj = None
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None, eval_at=None,
early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None, other_params=None):
feature_name=None, categorical_feature=None,
other_params=None):
"""
Most arguments like LGBMModel.fit except following:
eval_at : list of int
The evaulation positions of NDCG
"""
"""check group data"""
if "group" not in train_fields:
raise ValueError("should set group in train_fields for ranking task")
if group is None:
raise ValueError("should use group for ranking task")
if eval_set is not None:
if valid_fields is None:
raise ValueError("valid_fields cannot be None when eval_set is not None")
elif len(valid_fields) != len(eval_set):
raise ValueError("lenght of valid_fields should equal with eval_set")
if eval_group is None:
raise ValueError("eval_group cannot be None when eval_set is not None")
elif len(eval_group) != len(eval_set):
raise ValueError("length of eval_group should equal with eval_set")
else:
for inner in valid_fields:
if "group" not in inner:
raise ValueError("should set group in valid_fields for ranking task")
if callable(self.objective):
self.fobj = _group_wise_objective(self.objective)
else:
self.objective = "lambdarank"
self.fobj = None
if eval_metric is None and eval_set is not None:
eval_metric = "ndcg"
super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
early_stopping_rounds, verbose,
train_fields, valid_fields,
for inner_group in eval_group:
if inner_group is None:
raise ValueError("should set group for all eval data for ranking task")
if eval_at is not None:
other_params = {} if other_params is None else other_params
other_params['ndcg_eval_at'] = list(eval_at)
super(LGBMRanker, self).fit(X, y, sample_weight, init_score, group,
eval_set, eval_sample_weight, eval_init_score, eval_group,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
return self
#include <LightGBM/config.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/log.h>
#include <vector>
#include <string>
#include <unordered_set>
#include <algorithm>
#include <limits>
namespace LightGBM {
......@@ -22,7 +24,7 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par
continue;
}
params[key] = value;
} else {
} else if(Common::Trim(arg).size() > 0){
Log::Warning("Unknown parameter %s", arg.c_str());
}
}
......@@ -33,12 +35,21 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par
void OverallConfig::Set(const std::unordered_map<std::string, std::string>& params) {
// load main config types
GetInt(params, "num_threads", &num_threads);
// generate seeds by seed.
if (GetInt(params, "seed", &seed)) {
Random rand(seed);
int int_max = std::numeric_limits<int>::max();
io_config.data_random_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.bagging_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.drop_seed = static_cast<int>(rand.NextInt(0, int_max));
boosting_config.tree_config.feature_fraction_seed = static_cast<int>(rand.NextInt(0, int_max));
}
GetTaskType(params);
GetBoostingType(params);
GetObjectiveType(params);
GetMetricType(params);
// sub-config setup
network_config.Set(params);
io_config.Set(params);
......
......@@ -8,10 +8,6 @@ x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_s
train_data = lgb.Dataset(x_train, max_bin=255, label=y_train)
num_features = train_data.num_feature()
names = ["name_%d" %(i) for i in range(num_features)]
train_data.set_feature_name(names)
valid_data = train_data.create_valid(x_test, label=y_test)
config={"objective":"binary","metric":"auc", "min_data":1, "num_leaves":15}
......
# coding: utf-8
# pylint: disable = invalid-name, C0111
"""End-to-end smoke test of the lightgbm training engine.

Exercises: Dataset creation from pandas frames, training with a validation
set, continued training from an initial model, early stopping, model
save/dump, prediction, feature importance, and cross-validation.
Expects the example regression data files to exist relative to this script.
"""
import json
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
# tab-separated, no header; column 0 is the target
df_train = pd.read_csv('../../examples/regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../../examples/regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
# free_raw_data=False keeps the raw data so the same Dataset objects
# can be re-used by the later train() and cv() calls below
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,free_raw_data=False)
# specify your configurations as a dict
params = {
'task' : 'train',
'boosting_type' : 'gbdt',
'objective' : 'regression',
'metric' : {'l2', 'auc'},
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'bagging_freq': 5,
'verbose' : 0
}
# train
# short initial run used as the starting model for continued training
init_gbm = lgb.train(params,
lgb_train,
num_boost_round=5,
valid_sets=lgb_eval)
print('Start continue train')
# continue training from init_gbm with early stopping on the eval set
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
valid_sets=lgb_eval,
early_stopping_rounds=10,
init_model=init_gbm)
# save model to file
gbm.save_model('model.txt')
# predict
# num_iteration=best_iteration uses the early-stopped model size
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4)
# feature importances
# default importance type is split count; "gain" sums split gains
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))
print('Start test cv')
# cross-validation, also starting from the initial model
lgb.cv(params,
lgb_train,
num_boost_round=100,
nfold=5,
verbose_eval=5,
init_model=init_gbm)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment