[tests][python-package] change boston dataset to synthetic dataset in tests...

[tests][python-package] change boston dataset to synthetic dataset in tests that don't check score (#4895)

* change boston dataset to synthetic dataset in tests that don't evaluate score
* format imports

[tests][python-package] change boston dataset to synthetic dataset in tests...
[tests][python-package] change boston dataset to synthetic dataset in tests that don't check score (#4895)

* change boston dataset to synthetic dataset in tests that don't evaluate score
* format imports
8a34b1af · José Morales · GitHub · 8e729af3 · 8a34b1af · 8a34b1af
Unverified Commit 8a34b1af authored Dec 20, 2021 by José Morales Committed by GitHub Dec 21, 2021
3 changed files
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -18,7 +18,7 @@ from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_spli

 import lightgbm as lgb

-from .utils import load_boston, load_breast_cancer, load_digits, load_iris
+from .utils import load_boston, load_breast_cancer, load_digits, load_iris, make_synthetic_regression

 decreasing_generator = itertools.count(0, -1)

@@ -731,7 +731,7 @@ def test_continue_train():


 def test_continue_train_reused_dataset():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
    params = {
        'objective': 'regression',
        'verbose': -1
@@ -791,7 +791,7 @@ def test_continue_train_multiclass():


 def test_cv():
-    X_train, y_train = load_boston(return_X_y=True)
+    X_train, y_train = make_synthetic_regression()
    params = {'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train)
    # shuffle = False, override metric in params
@@ -887,7 +887,7 @@ def test_cvbooster():


 def test_feature_name():
-    X_train, y_train = load_boston(return_X_y=True)
+    X_train, y_train = make_synthetic_regression()
    params = {'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train)
    feature_names = [f'f_{i}' for i in range(X_train.shape[-1])]
@@ -917,7 +917,7 @@ def test_feature_name_with_non_ascii():

 def test_save_load_copy_pickle():
    def train_and_predict(init_model=None, return_model=False):
-        X, y = load_boston(return_X_y=True)
+        X, y = make_synthetic_regression()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'regression',
@@ -2102,7 +2102,7 @@ def test_default_objective_and_metric():

 @pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason='not enough RAM')
 def test_model_size():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
    data = lgb.Dataset(X, y)
    bst = lgb.train({'verbose': -1}, data, num_boost_round=2)
    y_pred = bst.predict(X)
@@ -2515,7 +2515,7 @@ def test_dataset_params_with_reference():

 def test_extra_trees():
    # check extra trees increases regularization
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
    lgb_x = lgb.Dataset(X, label=y)
    params = {'objective': 'regression',
              'num_leaves': 32,
@@ -2534,7 +2534,7 @@ def test_extra_trees():

 def test_path_smoothing():
    # check path smoothing increases regularization
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
    lgb_x = lgb.Dataset(X, label=y)
    params = {'objective': 'regression',
              'num_leaves': 32,
@@ -2804,7 +2804,7 @@ def test_predict_with_start_iteration():
        np.testing.assert_allclose(pred4, pred6)

    # test for regression
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
    params = {
        'objective': 'regression',
        'verbose': -1,

--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -18,7 +18,8 @@ from sklearn.utils.validation import check_is_fitted

 import lightgbm as lgb

-from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking
+from .utils import (load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking,
+                    make_synthetic_regression)

 sk_version = parse_version(sk_version)
 if sk_version < parse_version("0.23"):
@@ -184,7 +185,7 @@ def test_eval_at_aliases():

 @pytest.mark.parametrize("custom_objective", [True, False])
 def test_objective_aliases(custom_objective):
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    if custom_objective:
        obj = custom_dummy_obj
@@ -440,7 +441,7 @@ def test_regressor_chain():


 def test_clone_and_property():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
    gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
    gbm.fit(X, y)

@@ -458,7 +459,7 @@ def test_clone_and_property():


 def test_joblib():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj,
                            verbose=-1, importance_type='split')
@@ -499,7 +500,7 @@ def test_non_serializable_objects_in_callbacks(tmp_path):
    with pytest.raises(Exception, match="This class in not picklable"):
        joblib.dump(unpicklable_callback, tmp_path / 'tmp.joblib')

-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
    gbm = lgb.LGBMRegressor(n_estimators=5)
    gbm.fit(X, y, callbacks=[unpicklable_callback])
    assert gbm.booster_.attr('attr_set_inside_callback') == '40'
@@ -757,7 +758,7 @@ def test_predict_with_params_from_init():


 def test_evaluate_train_set():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
    gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
@@ -1332,7 +1333,7 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
        X, y = load_iris(return_X_y=True)
        model_factory = lgb.LGBMClassifier
    elif task == 'regression':
-        X, y = load_boston(return_X_y=True)
+        X, y = make_synthetic_regression()
        model_factory = lgb.LGBMRegressor
    X = pd.DataFrame(X)
    y_col_array = y.reshape(-1, 1)

--- a/tests/python_package_test/utils.py
+++ b/tests/python_package_test/utils.py
@@ -109,3 +109,8 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
        X[:, j] = bias + coef * y_vec

    return X, y_vec, group_id_vec
+
+
+@lru_cache(maxsize=None)  # cached per n_samples; callers share the returned objects, so don't mutate them in place
+def make_synthetic_regression(n_samples=100):
+    return sklearn.datasets.make_regression(n_samples, n_features=4, n_informative=2, random_state=42)