Commit 49178def authored by wxchan's avatar wxchan Committed by Guolin Ke

add dart parameter to sklearn interface (#145)

* remove other parameters in sklearn fit

* add dart parameters to sklearn init
parent 616388e0
......@@ -61,11 +61,6 @@ The methods of each Class is in alphabetical order.
True if raw data should be freed after constructing the inner dataset
####construct()
Lazy init
####create_valid(data, label=None, weight=None, group=None, silent=False, params=None)
Create validation data aligned with the current dataset.
......@@ -628,6 +623,16 @@ The methods of each Class is in alphabetical order.
Whether the training data is unbalanced (binary classification only)
seed : int
Random number seed.
drop_rate : float
Only used when boosting_type='dart'. Probability of selecting a tree to be dropped.
skip_drop : float
Only used when boosting_type='dart'. Probability of skipping the dropping procedure.
max_drop : int
Only used when boosting_type='dart'. Maximum number of dropped trees in one iteration.
uniform_drop : bool
Only used when boosting_type='dart'. If True, drop trees uniformly, else drop according to their weights.
xgboost_dart_mode : bool
Only used when boosting_type='dart'. Whether to use xgboost dart mode.
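For illustration only, a minimal sketch of fitting a DART model through the sklearn wrapper, reusing the Boston housing data already used in the test suite; the dart-specific values shown are simply the defaults:

```python
import lightgbm as lgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Placeholder data; any regression dataset works the same way.
X, y = load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# The dart-specific keyword arguments below are the ones documented above;
# the values are just the defaults, written out explicitly for illustration.
gbm = lgb.LGBMRegressor(boosting_type='dart',
                        n_estimators=50,
                        drop_rate=0.1,
                        skip_drop=0.5,
                        max_drop=50,
                        uniform_drop=False,
                        xgboost_dart_mode=False)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
```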
Note
----
......@@ -698,7 +703,7 @@ The methods of each Class is in alphabetical order.
Array of normalized feature importances
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, other_params=None)
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None)
Fit the gradient boosting model.
......@@ -735,8 +740,6 @@ The methods of each Class is in alphabetical order.
Categorical features;
an int value is interpreted as a column index,
a str value as a feature name (feature_name must be specified as well)
other_params: dict
Other parameters
Note
----
......@@ -762,12 +765,7 @@ The methods of each Class is in alphabetical order.
whether the eval result is better when bigger, e.g. AUC is bigger_better.
For multi-class tasks, y_pred is grouped by class_id first, then by row_id;
to get the i-th row's prediction for the j-th class, use y_pred[j * num_data + i] (see the sketch below)
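As a sketch of that layout (assuming the custom metric callable receives `(y_true, y_pred)` and returns `(eval_name, eval_result, is_bigger_better)`; the function name here is hypothetical), the flat multi-class `y_pred` can be reshaped into a conventional `[n_samples, n_classes]` matrix:

```python
import numpy as np

def multiclass_error(y_true, y_pred):
    # y_pred is laid out class by class: y_pred[j * num_data + i] is the
    # score of row i for class j, as described above.
    num_data = len(y_true)
    num_class = len(y_pred) // num_data
    probs = np.asarray(y_pred).reshape(num_class, num_data).T  # [num_data, num_class]
    pred_labels = np.argmax(probs, axis=1)
    # Error rate: smaller is better, so is_bigger_better is False.
    return 'multiclass_error', np.mean(pred_labels != np.asarray(y_true)), False
```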
####get_params(deep=False)
Get parameters.
####predict(data, raw_score=False, num_iteration=0)
......@@ -809,14 +807,14 @@ The methods of each Class is in alphabetical order.
###LGBMRanker
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric='ndcg', eval_at=1, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, other_params=None)
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric='ndcg', eval_at=1, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None)
Most arguments are the same as in Common Methods, except:
eval_at : int or list of int, default=1
The evaluation positions of NDCG (see the example below)
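For example, a small sketch of fitting a ranker and reporting NDCG at several cut-offs; the toy data, group sizes, and parameter values are placeholders:

```python
import numpy as np
import lightgbm as lgb

# Toy learning-to-rank data: 100 documents split into 10 queries of 10 documents each.
X = np.random.rand(100, 5)
y = np.random.randint(0, 4, size=100)   # graded relevance labels
group = [10] * 10                        # query group sizes, summing to 100

gbm = lgb.LGBMRanker(n_estimators=20)
gbm.fit(X, y, group=group,
        eval_set=[(X, y)], eval_group=[group],
        eval_at=[1, 3, 5],               # report NDCG@1, NDCG@3 and NDCG@5
        verbose=False)
```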
## Callbacks
##Callbacks
###Before iteration
......
......@@ -138,7 +138,9 @@ class LGBMModel(LGBMModelBase):
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
is_unbalance=False, seed=0,
drop_rate=0.1, skip_drop=0.5, max_drop=50,
uniform_drop=False, xgboost_dart_mode=False):
"""
Implementation of the Scikit-Learn API for LightGBM.
......@@ -187,6 +189,16 @@ class LGBMModel(LGBMModelBase):
Whether the training data is unbalanced (binary classification only)
seed : int
Random number seed.
drop_rate : float
Only used when boosting_type='dart'. Probability of selecting a tree to be dropped.
skip_drop : float
Only used when boosting_type='dart'. Probability of skipping the dropping procedure.
max_drop : int
Only used when boosting_type='dart'. Maximum number of dropped trees in one iteration.
uniform_drop : bool
Only used when boosting_type='dart'. If True, drop trees uniformly, else drop according to their weights.
xgboost_dart_mode : bool
Only used when boosting_type='dart'. Whether to use xgboost dart mode.
Note
----
......@@ -233,6 +245,11 @@ class LGBMModel(LGBMModelBase):
self.scale_pos_weight = scale_pos_weight
self.is_unbalance = is_unbalance
self.seed = seed
self.drop_rate = drop_rate
self.skip_drop = skip_drop
self.max_drop = max_drop
self.uniform_drop = uniform_drop
self.xgboost_dart_mode = xgboost_dart_mode
self._Booster = None
self.best_iteration = -1
if callable(self.objective):
......@@ -253,23 +270,13 @@ class LGBMModel(LGBMModelBase):
raise LightGBMError('Need to call fit beforehand')
return self._Booster
def get_params(self, deep=False):
"""
Get parameters
"""
params = super(LGBMModel, self).get_params(deep=deep)
if self.nthread <= 0:
params.pop('nthread', None)
return params
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
other_params=None):
feature_name=None, categorical_feature=None):
"""
Fit the gradient boosting model
......@@ -305,8 +312,6 @@ class LGBMModel(LGBMModelBase):
Categorical features;
an int value is interpreted as a column index,
a str value as a feature name (feature_name must be specified as well)
other_params: dict
Other parameters
Note
----
......@@ -335,10 +340,11 @@ class LGBMModel(LGBMModelBase):
"""
evals_result = {}
params = self.get_params()
params['verbose'] = 0 if self.silent else 1
if other_params is not None:
params.update(other_params)
params['verbose'] = -1 if self.silent else 1
if hasattr(self, 'n_classes_') and self.n_classes_ > 2:
params['num_class'] = self.n_classes_
if hasattr(self, 'eval_at'):
params['ndcg_eval_at'] = self.eval_at
if self.fobj:
params["objective"] = "None"
......@@ -408,9 +414,9 @@ class LGBMModel(LGBMModelBase):
-------
predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
"""
return self.booster().predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
return self._Booster.predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
def apply(self, X, num_iteration=0):
"""
......@@ -428,9 +434,9 @@ class LGBMModel(LGBMModelBase):
-------
X_leaves : array_like, shape=[n_samples, n_trees]
"""
return self.booster().predict(X,
pred_leaf=True,
num_iteration=num_iteration)
return self._Booster.predict(X,
pred_leaf=True,
num_iteration=num_iteration)
def evals_result(self):
"""
......@@ -466,14 +472,16 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score=None,
eval_metric="l2",
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
other_params=None):
super(LGBMRegressor, self).fit(X, y, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
feature_name=None, categorical_feature=None):
super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
init_score=init_score, eval_set=eval_set,
eval_sample_weight=eval_sample_weight,
eval_init_score=eval_init_score,
eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature)
return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
......@@ -484,14 +492,20 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
super(LGBMClassifier, self).__init__(boosting_type, num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective, nthread,
min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
is_unbalance=False, seed=0,
drop_rate=0.1, skip_drop=0.5, max_drop=50,
uniform_drop=False, xgboost_dart_mode=False):
super(LGBMClassifier, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
max_depth=max_depth, learning_rate=learning_rate,
n_estimators=n_estimators, max_bin=max_bin,
silent=silent, objective=objective, nthread=nthread,
min_split_gain=min_split_gain, min_child_weight=min_child_weight,
min_child_samples=min_child_samples, subsample=subsample,
subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha, reg_lambda=reg_lambda,
scale_pos_weight=scale_pos_weight, is_unbalance=is_unbalance, seed=seed,
drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode)
def fit(self, X, y,
sample_weight=None, init_score=None,
......@@ -499,37 +513,35 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score=None,
eval_metric="binary_logloss",
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
other_params=None):
feature_name=None, categorical_feature=None):
self.classes_ = np.unique(y)
self.n_classes_ = len(self.classes_)
if other_params is None:
other_params = {}
self._le = LGBMLabelEncoder().fit(y)
y = self._le.transform(y)
self.n_classes_ = len(self._le.classes_)
if self.n_classes_ > 2:
# Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass"
other_params['num_class'] = self.n_classes_
if eval_set is not None and eval_metric == "binary_logloss":
eval_metric = "multi_logloss"
self._le = LGBMLabelEncoder().fit(y)
training_labels = self._le.transform(y)
if eval_set is not None:
eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set)
super(LGBMClassifier, self).fit(X, training_labels, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
eval_set = [(x[0], self._le.transform(x[1])) for x in eval_set]
super(LGBMClassifier, self).fit(X, y, sample_weight=sample_weight,
init_score=init_score, eval_set=eval_set,
eval_sample_weight=eval_sample_weight,
eval_init_score=eval_init_score,
eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature)
return self
def predict(self, data, raw_score=False, num_iteration=0):
class_probs = self.booster().predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
class_probs = self._Booster.predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
if len(class_probs.shape) > 1:
column_indexes = np.argmax(class_probs, axis=1)
else:
......@@ -553,9 +565,9 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
-------
predicted_probability : array_like, shape=[n_samples, n_classes]
"""
class_probs = self.booster().predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
class_probs = self._Booster.predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
if self.n_classes_ > 2:
return class_probs
else:
......@@ -571,14 +583,20 @@ class LGBMRanker(LGBMModel):
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
super(LGBMRanker, self).__init__(boosting_type, num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective, nthread,
min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
is_unbalance=False, seed=0,
drop_rate=0.1, skip_drop=0.5, max_drop=50,
uniform_drop=False, xgboost_dart_mode=False):
super(LGBMRanker, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
max_depth=max_depth, learning_rate=learning_rate,
n_estimators=n_estimators, max_bin=max_bin,
silent=silent, objective=objective, nthread=nthread,
min_split_gain=min_split_gain, min_child_weight=min_child_weight,
min_child_samples=min_child_samples, subsample=subsample,
subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha, reg_lambda=reg_lambda,
scale_pos_weight=scale_pos_weight, is_unbalance=is_unbalance, seed=seed,
drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode)
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
......@@ -586,8 +604,7 @@ class LGBMRanker(LGBMModel):
eval_init_score=None, eval_group=None,
eval_metric='ndcg', eval_at=1,
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
other_params=None):
feature_name=None, categorical_feature=None):
"""
Most arguments are the same as in common methods, except the following:
......@@ -610,13 +627,13 @@ class LGBMRanker(LGBMModel):
raise ValueError("Should set group for all eval dataset for ranking task")
if eval_at is not None:
other_params = {} if other_params is None else other_params
if isinstance(eval_at, int):
eval_at = [eval_at]
other_params['ndcg_eval_at'] = list(eval_at)
super(LGBMRanker, self).fit(X, y, sample_weight, init_score, group,
eval_set, eval_sample_weight, eval_init_score, eval_group,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
self.eval_at = eval_at
super(LGBMRanker, self).fit(X, y, sample_weight=sample_weight,
init_score=init_score, group=group,
eval_set=eval_set, eval_sample_weight=eval_sample_weight,
eval_init_score=eval_init_score, eval_group=eval_group,
eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature)
return self
......@@ -16,7 +16,8 @@ def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
stratify=stratify,
random_state=42)
if return_data: return X_train, X_test, y_train, y_test
gbm = model(n_estimators=num_round, objective=custom_obj) if custom_obj else model(n_estimators=num_round)
if not custom_obj: gbm = model(n_estimators=num_round, silent=True)
else: gbm = model(n_estimators=num_round, objective=custom_obj, silent=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
if return_model: return gbm
else: return feval(y_test, gbm.predict_proba(X_test) if proba else gbm.predict(X_test))
......@@ -64,12 +65,19 @@ class TestSklearn(unittest.TestCase):
ret = test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
self.assertLess(ret, 0.1)
def test_dart(self):
X_train, X_test, y_train, y_test = test_template(return_data=True)
gbm = lgb.LGBMRegressor(boosting_type='dart')
gbm.fit(X_train, y_train)
self.assertLessEqual(gbm.score(X_train, y_train), 1.)
def test_grid_search(self):
X_train, X_test, y_train, y_test = test_template(return_data=True)
params = {'n_estimators': [10, 15, 20]}
gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=5)
params = {'boosting_type': ['dart', 'gbdt'],
'n_estimators': [15, 20], 'drop_rate':[0.1, 0.2]}
gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=3)
gbm.fit(X_train, y_train)
self.assertIn(gbm.best_params_['n_estimators'], [10, 15, 20])
self.assertIn(gbm.best_params_['n_estimators'], [15, 20])
def test_clone(self):
gbm = test_template(return_model=True)
......