[tests][dask] simplify code in Dask tests (#4075)

* simplify Dask tests code * enable CI * disable CI

[tests][dask] simplify code in Dask tests (#4075)
* simplify Dask tests code * enable CI * disable CI
1f4a0842 · Nikita Titov · GitHub · 39c85dd9 · 1f4a0842
Unverified Commit 1f4a0842 authored Mar 16, 2021 by Nikita Titov Committed by GitHub Mar 15, 2021
Show whitespace changes
Inline Side-by-side

Showing with 210 additions and 257 deletions

tests/python_package_test/test_dask.py tests/python_package_test/test_dask.py +210 -257

No files found.
--- a/tests/python_package_test/test_dask.py
+++ b/tests/python_package_test/test_dask.py
@@ -131,7 +131,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
    return X, y, w, g_rle, dX, dy, dw, dg


-def _create_data(objective, n_samples=100, output='array', chunk_size=50):
+def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwargs):
    if objective.endswith('classification'):
        if objective == 'binary-classification':
            centers = [[-4, -4], [4, 4]]
@@ -142,6 +142,13 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50):
        X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
    elif objective == 'regression':
        X, y = make_regression(n_samples=n_samples, random_state=42)
+    elif objective == 'ranking':
+        return _create_ranking_data(
+            n_samples=n_samples,
+            output=output,
+            chunk_size=chunk_size,
+            **kwargs
+        )
    else:
        raise ValueError("Unknown objective '%s'" % objective)
    rnd = np.random.RandomState(42)
@@ -183,7 +190,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50):
    else:
        raise ValueError("Unknown output type '%s'" % output)

-    return X, y, weights, dX, dy, dw
+    return X, y, weights, None, dX, dy, dw, None


 def _r2_score(dy_true, dy_pred):
@@ -225,7 +232,7 @@ def _unpickle(filepath, serializer):
 @pytest.mark.parametrize('output', data_output)
 @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
 def test_classifier(output, task, client):
-    X, y, w, dX, dy, dw = _create_data(
+    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective=task,
        output=output
    )
@@ -291,7 +298,7 @@ def test_classifier(output, task, client):
 @pytest.mark.parametrize('output', data_output)
 @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
 def test_classifier_pred_contrib(output, task, client):
-    X, y, w, dX, dy, dw = _create_data(
+    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective=task,
        output=output
    )
@@ -369,7 +376,7 @@ def test_find_random_open_port(client):


 def test_training_does_not_fail_on_port_conflicts(client):
-    _, _, _, dX, dy, dw = _create_data('binary-classification', output='array')
+    _, _, _, _, dX, dy, dw, _ = _create_data('binary-classification', output='array')

    lightgbm_default_port = 12400
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -393,7 +400,7 @@ def test_training_does_not_fail_on_port_conflicts(client):

 @pytest.mark.parametrize('output', data_output)
 def test_regressor(output, client):
-    X, y, w, dX, dy, dw = _create_data(
+    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective='regression',
        output=output
    )
@@ -468,7 +475,7 @@ def test_regressor(output, client):

 @pytest.mark.parametrize('output', data_output)
 def test_regressor_pred_contrib(output, client):
-    X, y, w, dX, dy, dw = _create_data(
+    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective='regression',
        output=output
    )
@@ -518,7 +525,7 @@ def test_regressor_pred_contrib(output, client):
 @pytest.mark.parametrize('output', data_output)
 @pytest.mark.parametrize('alpha', [.1, .5, .9])
 def test_regressor_quantile(output, client, alpha):
-    X, y, w, dX, dy, dw = _create_data(
+    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective='regression',
        output=output
    )
@@ -567,18 +574,19 @@ def test_regressor_quantile(output, client, alpha):
 @pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical'])
 @pytest.mark.parametrize('group', [None, group_sizes])
 def test_ranker(output, client, group):
-
    if output == 'dataframe-with-categorical':
-        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
+        X, y, w, g, dX, dy, dw, dg = _create_data(
+            objective='ranking',
            output=output,
            group=group,
            n_features=1,
            n_informative=1
        )
    else:
-        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
+        X, y, w, g, dX, dy, dw, dg = _create_data(
+            objective='ranking',
            output=output,
-            group=group,
+            group=group
        )

    # rebalance small dask.Array dataset for better performance.
@@ -650,17 +658,11 @@ def test_ranker(output, client, group):

 @pytest.mark.parametrize('task', tasks)
 def test_training_works_if_client_not_provided_or_set_after_construction(task, client):
-    if task == 'ranking':
-        _, _, _, _, dX, dy, _, dg = _create_ranking_data(
-            output='array',
-            group=None
-        )
-    else:
-        _, _, _, dX, dy, _ = _create_data(
+    _, _, _, _, dX, dy, _, dg = _create_data(
        objective=task,
        output='array',
+        group=None
    )
-        dg = None
    model_factory = task_to_dask_factory[task]

    params = {
@@ -723,37 +725,21 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, c
 @pytest.mark.parametrize('set_client', [True, False])
 def test_model_and_local_version_are_picklable_whether_or_not_client_set_explicitly(serializer, task, set_client, tmp_path):

-    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster1:
-        with Client(cluster1) as client1:
-
+    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster1, Client(cluster1) as client1:
        # data on cluster1
-            if task == 'ranking':
-                X_1, _, _, _, dX_1, dy_1, _, dg_1 = _create_ranking_data(
-                    output='array',
-                    group=None
-                )
-            else:
-                X_1, _, _, dX_1, dy_1, _ = _create_data(
+        X_1, _, _, _, dX_1, dy_1, _, dg_1 = _create_data(
            objective=task,
            output='array',
+            group=None
        )
-                dg_1 = None
-
-            with LocalCluster(n_workers=2, threads_per_worker=1) as cluster2:
-                with Client(cluster2) as client2:

+        with LocalCluster(n_workers=2, threads_per_worker=1) as cluster2, Client(cluster2) as client2:
            # create identical data on cluster2
-                    if task == 'ranking':
-                        X_2, _, _, _, dX_2, dy_2, _, dg_2 = _create_ranking_data(
-                            output='array',
-                            group=None
-                        )
-                    else:
-                        X_2, _, _, dX_2, dy_2, _ = _create_data(
+            X_2, _, _, _, dX_2, dy_2, _, dg_2 = _create_data(
                objective=task,
                output='array',
+                group=None
            )
-                        dg_2 = None

            model_factory = task_to_dask_factory[task]

@@ -971,18 +957,11 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(client, tas
            return collection.rechunk(*collection.shape)
        return collection.repartition(npartitions=1)

-    if task == 'ranking':
-        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
+    X, y, w, g, dX, dy, dw, dg = _create_data(
+        objective=task,
        output=output,
        group=None
    )
-    else:
-        X, y, w, dX, dy, dw = _create_data(
-            objective=task,
-            output=output
-        )
-        g = None
-        dg = None

    dask_model_factory = task_to_dask_factory[task]
    local_model_factory = task_to_local_factory[task]
@@ -1026,19 +1005,12 @@ def test_network_params_not_required_but_respected_if_given(client, task, output

    client.wait_for_workers(2)

-    if task == 'ranking':
-        _, _, _, _, dX, dy, _, dg = _create_ranking_data(
-            output=output,
-            group=None,
-            chunk_size=10,
-        )
-    else:
-        _, _, _, dX, dy, _ = _create_data(
+    _, _, _, _, dX, dy, _, dg = _create_data(
        objective=task,
        output=output,
        chunk_size=10,
+        group=None
    )
-        dg = None

    dask_model_factory = task_to_dask_factory[task]

@@ -1097,19 +1069,12 @@ def test_machines_should_be_used_if_provided(task, output):
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
-        if task == 'ranking':
-            _, _, _, _, dX, dy, _, dg = _create_ranking_data(
-                output=output,
-                group=None,
-                chunk_size=10,
-            )
-        else:
-            _, _, _, dX, dy, _ = _create_data(
+        _, _, _, _, dX, dy, _, dg = _create_data(
            objective=task,
            output=output,
            chunk_size=10,
+            group=None
        )
-            dg = None

        dask_model_factory = task_to_dask_factory[task]

@@ -1205,17 +1170,11 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
    task,
    client,
 ):
-    if task == 'ranking':
-        _, _, _, _, dX, dy, dw, dg = _create_ranking_data(
-            output='dataframe',
-            group=None
-        )
-    else:
-        _, _, _, dX, dy, dw = _create_data(
+    _, _, _, _, dX, dy, dw, dg = _create_data(
        objective=task,
        output='dataframe',
+        group=None
    )
-        dg = None

    model_factory = task_to_dask_factory[task]

@@ -1242,17 +1201,11 @@ def test_init_score(task, output, client):
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

-    if task == 'ranking':
-        _, _, _, _, dX, dy, dw, dg = _create_ranking_data(
-            output=output,
-            group=None
-        )
-    else:
-        _, _, _, dX, dy, dw = _create_data(
+    _, _, _, _, dX, dy, dw, dg = _create_data(
        objective=task,
        output=output,
+        group=None
    )
-        dg = None

    model_factory = task_to_dask_factory[task]