Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
1f4a0842
Unverified
Commit
1f4a0842
authored
Mar 16, 2021
by
Nikita Titov
Committed by
GitHub
Mar 15, 2021
Browse files
[tests][dask] simplify code in Dask tests (#4075)
* simplify Dask tests code * enable CI * disable CI
parent
39c85dd9
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
210 additions
and
257 deletions
+210
-257
tests/python_package_test/test_dask.py
tests/python_package_test/test_dask.py
+210
-257
No files found.
tests/python_package_test/test_dask.py
View file @
1f4a0842
...
...
@@ -131,7 +131,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
return
X
,
y
,
w
,
g_rle
,
dX
,
dy
,
dw
,
dg
def
_create_data
(
objective
,
n_samples
=
100
,
output
=
'array'
,
chunk_size
=
50
):
def
_create_data
(
objective
,
n_samples
=
100
,
output
=
'array'
,
chunk_size
=
50
,
**
kwargs
):
if
objective
.
endswith
(
'classification'
):
if
objective
==
'binary-classification'
:
centers
=
[[
-
4
,
-
4
],
[
4
,
4
]]
...
...
@@ -142,6 +142,13 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50):
X
,
y
=
make_blobs
(
n_samples
=
n_samples
,
centers
=
centers
,
random_state
=
42
)
elif
objective
==
'regression'
:
X
,
y
=
make_regression
(
n_samples
=
n_samples
,
random_state
=
42
)
elif
objective
==
'ranking'
:
return
_create_ranking_data
(
n_samples
=
n_samples
,
output
=
output
,
chunk_size
=
chunk_size
,
**
kwargs
)
else
:
raise
ValueError
(
"Unknown objective '%s'"
%
objective
)
rnd
=
np
.
random
.
RandomState
(
42
)
...
...
@@ -183,7 +190,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50):
else
:
raise
ValueError
(
"Unknown output type '%s'"
%
output
)
return
X
,
y
,
weights
,
dX
,
dy
,
dw
return
X
,
y
,
weights
,
None
,
dX
,
dy
,
dw
,
None
def
_r2_score
(
dy_true
,
dy_pred
):
...
...
@@ -225,7 +232,7 @@ def _unpickle(filepath, serializer):
@
pytest
.
mark
.
parametrize
(
'output'
,
data_output
)
@
pytest
.
mark
.
parametrize
(
'task'
,
[
'binary-classification'
,
'multiclass-classification'
])
def
test_classifier
(
output
,
task
,
client
):
X
,
y
,
w
,
dX
,
dy
,
dw
=
_create_data
(
X
,
y
,
w
,
_
,
dX
,
dy
,
dw
,
_
=
_create_data
(
objective
=
task
,
output
=
output
)
...
...
@@ -291,7 +298,7 @@ def test_classifier(output, task, client):
@
pytest
.
mark
.
parametrize
(
'output'
,
data_output
)
@
pytest
.
mark
.
parametrize
(
'task'
,
[
'binary-classification'
,
'multiclass-classification'
])
def
test_classifier_pred_contrib
(
output
,
task
,
client
):
X
,
y
,
w
,
dX
,
dy
,
dw
=
_create_data
(
X
,
y
,
w
,
_
,
dX
,
dy
,
dw
,
_
=
_create_data
(
objective
=
task
,
output
=
output
)
...
...
@@ -369,7 +376,7 @@ def test_find_random_open_port(client):
def
test_training_does_not_fail_on_port_conflicts
(
client
):
_
,
_
,
_
,
dX
,
dy
,
dw
=
_create_data
(
'binary-classification'
,
output
=
'array'
)
_
,
_
,
_
,
_
,
dX
,
dy
,
dw
,
_
=
_create_data
(
'binary-classification'
,
output
=
'array'
)
lightgbm_default_port
=
12400
with
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
)
as
s
:
...
...
@@ -393,7 +400,7 @@ def test_training_does_not_fail_on_port_conflicts(client):
@
pytest
.
mark
.
parametrize
(
'output'
,
data_output
)
def
test_regressor
(
output
,
client
):
X
,
y
,
w
,
dX
,
dy
,
dw
=
_create_data
(
X
,
y
,
w
,
_
,
dX
,
dy
,
dw
,
_
=
_create_data
(
objective
=
'regression'
,
output
=
output
)
...
...
@@ -468,7 +475,7 @@ def test_regressor(output, client):
@
pytest
.
mark
.
parametrize
(
'output'
,
data_output
)
def
test_regressor_pred_contrib
(
output
,
client
):
X
,
y
,
w
,
dX
,
dy
,
dw
=
_create_data
(
X
,
y
,
w
,
_
,
dX
,
dy
,
dw
,
_
=
_create_data
(
objective
=
'regression'
,
output
=
output
)
...
...
@@ -518,7 +525,7 @@ def test_regressor_pred_contrib(output, client):
@
pytest
.
mark
.
parametrize
(
'output'
,
data_output
)
@
pytest
.
mark
.
parametrize
(
'alpha'
,
[.
1
,
.
5
,
.
9
])
def
test_regressor_quantile
(
output
,
client
,
alpha
):
X
,
y
,
w
,
dX
,
dy
,
dw
=
_create_data
(
X
,
y
,
w
,
_
,
dX
,
dy
,
dw
,
_
=
_create_data
(
objective
=
'regression'
,
output
=
output
)
...
...
@@ -567,18 +574,19 @@ def test_regressor_quantile(output, client, alpha):
@
pytest
.
mark
.
parametrize
(
'output'
,
[
'array'
,
'dataframe'
,
'dataframe-with-categorical'
])
@
pytest
.
mark
.
parametrize
(
'group'
,
[
None
,
group_sizes
])
def
test_ranker
(
output
,
client
,
group
):
if
output
==
'dataframe-with-categorical'
:
X
,
y
,
w
,
g
,
dX
,
dy
,
dw
,
dg
=
_create_ranking_data
(
X
,
y
,
w
,
g
,
dX
,
dy
,
dw
,
dg
=
_create_data
(
objective
=
'ranking'
,
output
=
output
,
group
=
group
,
n_features
=
1
,
n_informative
=
1
)
else
:
X
,
y
,
w
,
g
,
dX
,
dy
,
dw
,
dg
=
_create_ranking_data
(
X
,
y
,
w
,
g
,
dX
,
dy
,
dw
,
dg
=
_create_data
(
objective
=
'ranking'
,
output
=
output
,
group
=
group
,
group
=
group
)
# rebalance small dask.Array dataset for better performance.
...
...
@@ -650,17 +658,11 @@ def test_ranker(output, client, group):
@
pytest
.
mark
.
parametrize
(
'task'
,
tasks
)
def
test_training_works_if_client_not_provided_or_set_after_construction
(
task
,
client
):
if
task
==
'ranking'
:
_
,
_
,
_
,
_
,
dX
,
dy
,
_
,
dg
=
_create_ranking_data
(
output
=
'array'
,
group
=
None
)
else
:
_
,
_
,
_
,
dX
,
dy
,
_
=
_create_data
(
_
,
_
,
_
,
_
,
dX
,
dy
,
_
,
dg
=
_create_data
(
objective
=
task
,
output
=
'array'
,
group
=
None
)
dg
=
None
model_factory
=
task_to_dask_factory
[
task
]
params
=
{
...
...
@@ -723,37 +725,21 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, c
@
pytest
.
mark
.
parametrize
(
'set_client'
,
[
True
,
False
])
def
test_model_and_local_version_are_picklable_whether_or_not_client_set_explicitly
(
serializer
,
task
,
set_client
,
tmp_path
):
with
LocalCluster
(
n_workers
=
2
,
threads_per_worker
=
1
)
as
cluster1
:
with
Client
(
cluster1
)
as
client1
:
with
LocalCluster
(
n_workers
=
2
,
threads_per_worker
=
1
)
as
cluster1
,
Client
(
cluster1
)
as
client1
:
# data on cluster1
if
task
==
'ranking'
:
X_1
,
_
,
_
,
_
,
dX_1
,
dy_1
,
_
,
dg_1
=
_create_ranking_data
(
output
=
'array'
,
group
=
None
)
else
:
X_1
,
_
,
_
,
dX_1
,
dy_1
,
_
=
_create_data
(
X_1
,
_
,
_
,
_
,
dX_1
,
dy_1
,
_
,
dg_1
=
_create_data
(
objective
=
task
,
output
=
'array'
,
group
=
None
)
dg_1
=
None
with
LocalCluster
(
n_workers
=
2
,
threads_per_worker
=
1
)
as
cluster2
:
with
Client
(
cluster2
)
as
client2
:
with
LocalCluster
(
n_workers
=
2
,
threads_per_worker
=
1
)
as
cluster2
,
Client
(
cluster2
)
as
client2
:
# create identical data on cluster2
if
task
==
'ranking'
:
X_2
,
_
,
_
,
_
,
dX_2
,
dy_2
,
_
,
dg_2
=
_create_ranking_data
(
output
=
'array'
,
group
=
None
)
else
:
X_2
,
_
,
_
,
dX_2
,
dy_2
,
_
=
_create_data
(
X_2
,
_
,
_
,
_
,
dX_2
,
dy_2
,
_
,
dg_2
=
_create_data
(
objective
=
task
,
output
=
'array'
,
group
=
None
)
dg_2
=
None
model_factory
=
task_to_dask_factory
[
task
]
...
...
@@ -971,18 +957,11 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(client, tas
return
collection
.
rechunk
(
*
collection
.
shape
)
return
collection
.
repartition
(
npartitions
=
1
)
if
task
==
'ranking'
:
X
,
y
,
w
,
g
,
dX
,
dy
,
dw
,
dg
=
_create_ranking_data
(
X
,
y
,
w
,
g
,
dX
,
dy
,
dw
,
dg
=
_create_data
(
objective
=
task
,
output
=
output
,
group
=
None
)
else
:
X
,
y
,
w
,
dX
,
dy
,
dw
=
_create_data
(
objective
=
task
,
output
=
output
)
g
=
None
dg
=
None
dask_model_factory
=
task_to_dask_factory
[
task
]
local_model_factory
=
task_to_local_factory
[
task
]
...
...
@@ -1026,19 +1005,12 @@ def test_network_params_not_required_but_respected_if_given(client, task, output
client
.
wait_for_workers
(
2
)
if
task
==
'ranking'
:
_
,
_
,
_
,
_
,
dX
,
dy
,
_
,
dg
=
_create_ranking_data
(
output
=
output
,
group
=
None
,
chunk_size
=
10
,
)
else
:
_
,
_
,
_
,
dX
,
dy
,
_
=
_create_data
(
_
,
_
,
_
,
_
,
dX
,
dy
,
_
,
dg
=
_create_data
(
objective
=
task
,
output
=
output
,
chunk_size
=
10
,
group
=
None
)
dg
=
None
dask_model_factory
=
task_to_dask_factory
[
task
]
...
...
@@ -1097,19 +1069,12 @@ def test_machines_should_be_used_if_provided(task, output):
pytest
.
skip
(
'LGBMRanker is not currently tested on sparse matrices'
)
with
LocalCluster
(
n_workers
=
2
)
as
cluster
,
Client
(
cluster
)
as
client
:
if
task
==
'ranking'
:
_
,
_
,
_
,
_
,
dX
,
dy
,
_
,
dg
=
_create_ranking_data
(
output
=
output
,
group
=
None
,
chunk_size
=
10
,
)
else
:
_
,
_
,
_
,
dX
,
dy
,
_
=
_create_data
(
_
,
_
,
_
,
_
,
dX
,
dy
,
_
,
dg
=
_create_data
(
objective
=
task
,
output
=
output
,
chunk_size
=
10
,
group
=
None
)
dg
=
None
dask_model_factory
=
task_to_dask_factory
[
task
]
...
...
@@ -1205,17 +1170,11 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
task
,
client
,
):
if
task
==
'ranking'
:
_
,
_
,
_
,
_
,
dX
,
dy
,
dw
,
dg
=
_create_ranking_data
(
output
=
'dataframe'
,
group
=
None
)
else
:
_
,
_
,
_
,
dX
,
dy
,
dw
=
_create_data
(
_
,
_
,
_
,
_
,
dX
,
dy
,
dw
,
dg
=
_create_data
(
objective
=
task
,
output
=
'dataframe'
,
group
=
None
)
dg
=
None
model_factory
=
task_to_dask_factory
[
task
]
...
...
@@ -1242,17 +1201,11 @@ def test_init_score(task, output, client):
if
task
==
'ranking'
and
output
==
'scipy_csr_matrix'
:
pytest
.
skip
(
'LGBMRanker is not currently tested on sparse matrices'
)
if
task
==
'ranking'
:
_
,
_
,
_
,
_
,
dX
,
dy
,
dw
,
dg
=
_create_ranking_data
(
output
=
output
,
group
=
None
)
else
:
_
,
_
,
_
,
dX
,
dy
,
dw
=
_create_data
(
_
,
_
,
_
,
_
,
dX
,
dy
,
dw
,
dg
=
_create_data
(
objective
=
task
,
output
=
output
,
group
=
None
)
dg
=
None
model_factory
=
task_to_dask_factory
[
task
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment