Commit f8267a50 authored by Guolin Ke's avatar Guolin Ke
Browse files

add min_data, fix test

parent f65164f6
......@@ -80,7 +80,9 @@ class LGBMModel(LGBMModelBase):
gamma : float
Minimum loss reduction required to make a further partition on a leaf node of the tree.
min_child_weight : int
Minimum sum of instance weight(hessian) needed in a child.
Minimum sum of instance weight (hessian) needed in a child (leaf).
min_data : int
Minimum number of data needed in a child (leaf).
subsample : float
Subsample ratio of the training instance.
subsample_freq : int
......@@ -121,10 +123,10 @@ class LGBMModel(LGBMModelBase):
and you should group grad and hess in this way as well
"""
def __init__(self, num_leaves=63, max_depth=-1,
learning_rate=0.1, n_estimators=100, max_bin=255,
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="regression",
nthread=-1, gamma=0, min_child_weight=1,
nthread=-1, gamma=0, min_child_weight=5, min_data=10,
subsample=1, subsample_freq=1, colsample_bytree=1, colsample_byleaf=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
......@@ -141,6 +143,7 @@ class LGBMModel(LGBMModelBase):
self.nthread = nthread
self.gamma = gamma
self.min_child_weight = min_child_weight
self.min_data = min_data
self.subsample = subsample
self.subsample_freq = subsample_freq
self.colsample_bytree = colsample_bytree
......
......@@ -10,13 +10,13 @@ def test_binary_classification():
from sklearn import datasets, metrics, model_selection
X, y = datasets.make_classification(n_samples=10000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier().fit(x_train, y_train)
from sklearn.datasets import load_digits
digits = load_digits(2)
y = digits['target']
X = digits['data']
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier().fit(x_train, y_train)
preds = lgb_model.predict(x_test)
err = sum(1 for i in range(len(preds))
......@@ -35,7 +35,7 @@ def test_multiclass_classification():
X, y = datasets.make_classification(n_samples=10000, n_features=100, n_classes=4, n_informative=3)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier().fit(x_train, y_train)
preds = lgb_model.predict(x_test)
......@@ -51,10 +51,10 @@ def test_regression():
boston = load_boston()
y = boston['target']
X = boston['data']
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMRegressor().fit(x_train, y_train)
preds = lgb_model.predict(x_test)
assert mean_squared_error(preds, y_test) < 40
assert mean_squared_error(preds, y_test) < 100
def test_regression_with_custom_objective():
from sklearn.metrics import mean_squared_error
......@@ -68,10 +68,10 @@ def test_regression_with_custom_objective():
boston = load_boston()
y = boston['target']
X = boston['data']
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMRegressor(objective=objective_ls).fit(x_train, y_train)
preds = lgb_model.predict(x_test)
assert mean_squared_error(preds, y_test) < 40
assert mean_squared_error(preds, y_test) < 100
def test_binary_classification_with_custom_objective():
......@@ -83,13 +83,13 @@ def test_binary_classification_with_custom_objective():
hess = y_pred * (1.0 - y_pred)
return grad, hess
X, y = datasets.make_classification(n_samples=10000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier(objective=logregobj).fit(x_train, y_train)
from sklearn.datasets import load_digits
digits = load_digits(2)
y = digits['target']
X = digits['data']
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
lgb_model = lgb.LGBMClassifier(objective=logregobj).fit(x_train, y_train)
preds = lgb_model.predict(x_test)
err = sum(1 for i in range(len(preds))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment