[tests][dask] test all boosting types (fixes #3896) (#4119)

* test all boosting types * lint * bring scores comparison back and set y as second argument in assert_eq

[tests][dask] test all boosting types (fixes #3896) (#4119)
* test all boosting types * lint * bring scores comparison back and set y as second argument in assert_eq
f879018b · jmoralez · GitHub · e4cf2e4f · f879018b
Unverified Commit f879018b authored Mar 30, 2021 by jmoralez Committed by GitHub Mar 30, 2021
Show whitespace changes
Inline Side-by-side

Showing with 39 additions and 10 deletions

tests/python_package_test/test_dask.py tests/python_package_test/test_dask.py +39 -10

No files found.
--- a/tests/python_package_test/test_dask.py
+++ b/tests/python_package_test/test_dask.py
@@ -45,6 +45,7 @@ CLIENT_CLOSE_TIMEOUT = 120

 tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking']
 data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical']
+boosting_types = ['gbdt', 'dart', 'goss', 'rf']
 group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
 task_to_dask_factory = {
    'regression': lgb.DaskLGBMRegressor,
@@ -233,16 +234,25 @@ def _unpickle(filepath, serializer):

 @pytest.mark.parametrize('output', data_output)
 @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
-def test_classifier(output, task, client):
+@pytest.mark.parametrize('boosting_type', boosting_types)
+def test_classifier(output, task, boosting_type, client):
    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective=task,
        output=output
    )

    params = {
+        "boosting_type": boosting_type,
        "n_estimators": 50,
        "num_leaves": 31
    }
+    if boosting_type == 'rf':
+        params.update({
+            'bagging_freq': 1,
+            'bagging_fraction': 0.9,
+        })
+    elif boosting_type == 'goss':
+        params['top_rate'] = 0.5

    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
@@ -263,13 +273,18 @@ def test_classifier(output, task, client):
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

+    if boosting_type == 'rf' and output == 'dataframe-with-categorical':
+        # https://github.com/microsoft/LightGBM/issues/4118
+        assert_eq(s1, s2, atol=0.01)
+        assert_eq(p1_proba, p2_proba, atol=0.8)
+    else:
        assert_eq(s1, s2)
        assert_eq(p1, p2)
-    assert_eq(y, p1)
-    assert_eq(y, p2)
-    assert_eq(p1_proba, p2_proba, atol=0.01)
+        assert_eq(p1, y)
+        assert_eq(p2, y)
+        assert_eq(p1_proba, p2_proba, atol=0.03)
        assert_eq(p1_local, p2)
-    assert_eq(y, p1_local)
+        assert_eq(p1_local, y)

    # pref_leaf values should have the right shape
    # and values that look like valid tree nodes
@@ -401,17 +416,24 @@ def test_training_does_not_fail_on_port_conflicts(client):


 @pytest.mark.parametrize('output', data_output)
-def test_regressor(output, client):
+@pytest.mark.parametrize('boosting_type', boosting_types)
+def test_regressor(output, boosting_type, client):
    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective='regression',
        output=output
    )

    params = {
+        "boosting_type": boosting_type,
        "random_state": 42,
        "num_leaves": 31,
        "n_estimators": 20,
    }
+    if boosting_type == 'rf':
+        params.update({
+            'bagging_freq': 1,
+            'bagging_fraction': 0.9,
+        })

    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
@@ -569,7 +591,8 @@ def test_regressor_quantile(output, client, alpha):

 @pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical'])
 @pytest.mark.parametrize('group', [None, group_sizes])
-def test_ranker(output, client, group):
+@pytest.mark.parametrize('boosting_type', boosting_types)
+def test_ranker(output, group, boosting_type, client):
    if output == 'dataframe-with-categorical':
        X, y, w, g, dX, dy, dw, dg = _create_data(
            objective='ranking',
@@ -597,11 +620,17 @@ def test_ranker(output, client, group):
    # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
    # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
    params = {
+        "boosting_type": boosting_type,
        "random_state": 42,
        "n_estimators": 50,
        "num_leaves": 20,
        "min_child_samples": 1
    }
+    if boosting_type == 'rf':
+        params.update({
+            'bagging_freq': 1,
+            'bagging_fraction': 0.9,
+        })

    dask_ranker = lgb.DaskLGBMRanker(
        client=client,