[tests][dask] Create an informative categorical feature (#4113)

* make one categorical variable informative. increase n_samples. reduce n_features for regression
* adjust tolerances in checks

[tests][dask] Create an informative categorical feature (#4113)
* make one categorical variable informative. increase n_samples. reduce n_features for regression
* adjust tolerances in checks
8cc6eefc · jmoralez · GitHub · a45ed16f · 8cc6eefc
Unverified Commit 8cc6eefc authored Mar 26, 2021 by jmoralez Committed by GitHub Mar 26, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 21 additions and 25 deletions

tests/python_package_test/test_dask.py +21 -25

No files found.
--- a/tests/python_package_test/test_dask.py
+++ b/tests/python_package_test/test_dask.py
@@ -131,7 +131,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
    return X, y, w, g_rle, dX, dy, dw, dg
-def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwargs):
+def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **kwargs):
    if objective.endswith('classification'):
        if objective == 'binary-classification':
            centers = [[-4, -4], [4, 4]]
@@ -141,7 +141,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
            raise ValueError(f"Unknown classification task '{objective}'")
        X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
    elif objective == 'regression':
-        X, y = make_regression(n_samples=n_samples, random_state=42)
+        X, y = make_regression(n_samples=n_samples, n_features=4, n_informative=2, random_state=42)
    elif objective == 'ranking':
        return _create_ranking_data(
            n_samples=n_samples,
@@ -161,7 +161,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
    elif output.startswith('dataframe'):
        X_df = pd.DataFrame(X, columns=['feature_%d' % i for i in range(X.shape[1])])
        if output == 'dataframe-with-categorical':
-            num_cat_cols = 5
+            num_cat_cols = 2
            for i in range(num_cat_cols):
                col_name = "cat_col" + str(i)
                cat_values = rnd.choice(['a', 'b'], X.shape[0])
@@ -172,13 +172,15 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
                X_df[col_name] = cat_series
                X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1)))
-            # for the small data sizes used in tests, it's hard to get LGBMRegressor to choose
+            # make one categorical feature relevant to the target
-            # categorical features for splits. So for regression tests with categorical features,
+            cat_col_is_a = X_df['cat_col0'] == 'a'
-            # _create_data() returns a DataFrame with ONLY categorical features
            if objective == 'regression':
-                cat_cols = [col for col in X_df.columns if col.startswith('cat_col')]
+                y = np.where(cat_col_is_a, y, 2 * y)
-                X_df = X_df[cat_cols]
+            elif objective == 'binary-classification':
-                X = X[:, -num_cat_cols:]
+                y = np.where(cat_col_is_a, y, 1 - y)
+            elif objective == 'multiclass-classification':
+                n_classes = 3
+                y = np.where(cat_col_is_a, y, (1 + y) % n_classes)
        y_df = pd.Series(y, name='target')
        dX = dd.from_pandas(X_df, chunksize=chunk_size)
        dy = dd.from_pandas(y_df, chunksize=chunk_size)
@@ -238,8 +240,8 @@ def test_classifier(output, task, client):
    )
    params = {
-        "n_estimators": 10,
+        "n_estimators": 50,
-        "num_leaves": 10
+        "num_leaves": 31
    }
    dask_classifier = lgb.DaskLGBMClassifier(
@@ -265,7 +267,7 @@ def test_classifier(output, task, client):
    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)
-    assert_eq(p1_proba, p2_proba, atol=0.3)
+    assert_eq(p1_proba, p2_proba, atol=0.01)
    assert_eq(p1_local, p2)
    assert_eq(y, p1_local)
@@ -407,7 +409,8 @@ def test_regressor(output, client):
    params = {
        "random_state": 42,
-        "num_leaves": 10
+        "num_leaves": 31,
+        "n_estimators": 20,
    }
    dask_regressor = lgb.DaskLGBMRegressor(
@@ -420,8 +423,7 @@ def test_regressor(output, client):
    p1 = dask_regressor.predict(dX)
    p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
-    if not output.startswith('dataframe'):
+    s1 = _r2_score(dy, p1)
-        s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)
@@ -432,9 +434,8 @@ def test_regressor(output, client):
    p2 = local_regressor.predict(X)
    # Scores should be the same
-    if not output.startswith('dataframe'):
+    assert_eq(s1, s2, atol=0.01)
-        assert_eq(s1, s2, atol=.01)
+    assert_eq(s1, s1_local)
-        assert_eq(s1, s1_local, atol=.003)
    # Predictions should be roughly the same.
    assert_eq(p1, p1_local)
@@ -450,13 +451,8 @@ def test_regressor(output, client):
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
-    # The checks below are skipped
+    assert_eq(p1, y, rtol=0.5, atol=50.)
-    # for the categorical data case because it's difficult to get
+    assert_eq(p2, y, rtol=0.5, atol=50.)
-    # a good fit from just categoricals for a regression problem
-    # with small data
-    if output != 'dataframe-with-categorical':
-        assert_eq(y, p1, rtol=1., atol=100.)
-        assert_eq(y, p2, rtol=1., atol=50.)
    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature