Unverified Commit d517ba12 authored by jmoralez's avatar jmoralez Committed by GitHub
Browse files

[tests][dask] Add voting_parallel algorithm in tests (fixes #3834) (#4088)

* include voting_parallel tree_learner in test_regressor, test_classifier and test_ranker

* remove test for warnings and test for error when using feature_parallel

* use real names for tree_learner in tests and include test for aliases; use the actual error message in the test for the feature_parallel error

* split all tests with rf in test_classifier

* remove task parametrization for tree_learner aliases test; use smaller input data for the feature_parallel error test

* define task for tree_learner aliases
parent 46a20ab0
...@@ -309,12 +309,6 @@ def _train( ...@@ -309,12 +309,6 @@ def _train(
_log_warning('Parameter tree_learner set to %s, which is not allowed. Using "data" as default' % params['tree_learner']) _log_warning('Parameter tree_learner set to %s, which is not allowed. Using "data" as default' % params['tree_learner'])
params['tree_learner'] = 'data' params['tree_learner'] = 'data'
if params['tree_learner'] not in {'data', 'data_parallel'}:
_log_warning(
'Support for tree_learner %s in lightgbm.dask is experimental and may break in a future release. \n'
'Use "data" for a stable, well-tested interface.' % params['tree_learner']
)
# Some passed-in parameters can be removed: # Some passed-in parameters can be removed:
# * 'num_machines': set automatically from Dask worker list # * 'num_machines': set automatically from Dask worker list
# * 'num_threads': overridden to match nthreads on each Dask process # * 'num_threads': overridden to match nthreads on each Dask process
......
...@@ -44,6 +44,7 @@ sk_version = parse_version(sk_version) ...@@ -44,6 +44,7 @@ sk_version = parse_version(sk_version)
CLIENT_CLOSE_TIMEOUT = 120 CLIENT_CLOSE_TIMEOUT = 120
tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking'] tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking']
distributed_training_algorithms = ['data', 'voting']
data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical'] data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical']
boosting_types = ['gbdt', 'dart', 'goss', 'rf'] boosting_types = ['gbdt', 'dart', 'goss', 'rf']
group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50] group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
...@@ -235,7 +236,8 @@ def _unpickle(filepath, serializer): ...@@ -235,7 +236,8 @@ def _unpickle(filepath, serializer):
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
@pytest.mark.parametrize('boosting_type', boosting_types) @pytest.mark.parametrize('boosting_type', boosting_types)
def test_classifier(output, task, boosting_type, client): @pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
def test_classifier(output, task, boosting_type, tree_learner, client):
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(
objective=task, objective=task,
output=output output=output
...@@ -243,6 +245,7 @@ def test_classifier(output, task, boosting_type, client): ...@@ -243,6 +245,7 @@ def test_classifier(output, task, boosting_type, client):
params = { params = {
"boosting_type": boosting_type, "boosting_type": boosting_type,
"tree_learner": tree_learner,
"n_estimators": 50, "n_estimators": 50,
"num_leaves": 31 "num_leaves": 31
} }
...@@ -273,7 +276,7 @@ def test_classifier(output, task, boosting_type, client): ...@@ -273,7 +276,7 @@ def test_classifier(output, task, boosting_type, client):
p2_proba = local_classifier.predict_proba(X) p2_proba = local_classifier.predict_proba(X)
s2 = local_classifier.score(X, y) s2 = local_classifier.score(X, y)
if boosting_type == 'rf' and output == 'dataframe-with-categorical': if boosting_type == 'rf':
# https://github.com/microsoft/LightGBM/issues/4118 # https://github.com/microsoft/LightGBM/issues/4118
assert_eq(s1, s2, atol=0.01) assert_eq(s1, s2, atol=0.01)
assert_eq(p1_proba, p2_proba, atol=0.8) assert_eq(p1_proba, p2_proba, atol=0.8)
...@@ -448,7 +451,8 @@ def test_training_does_not_fail_on_port_conflicts(client): ...@@ -448,7 +451,8 @@ def test_training_does_not_fail_on_port_conflicts(client):
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('boosting_type', boosting_types) @pytest.mark.parametrize('boosting_type', boosting_types)
def test_regressor(output, boosting_type, client): @pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
def test_regressor(output, boosting_type, tree_learner, client):
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(
objective='regression', objective='regression',
output=output output=output
...@@ -469,7 +473,7 @@ def test_regressor(output, boosting_type, client): ...@@ -469,7 +473,7 @@ def test_regressor(output, boosting_type, client):
dask_regressor = lgb.DaskLGBMRegressor( dask_regressor = lgb.DaskLGBMRegressor(
client=client, client=client,
time_out=5, time_out=5,
tree='data', tree=tree_learner,
**params **params
) )
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw) dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
...@@ -623,7 +627,8 @@ def test_regressor_quantile(output, client, alpha): ...@@ -623,7 +627,8 @@ def test_regressor_quantile(output, client, alpha):
@pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical']) @pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical'])
@pytest.mark.parametrize('group', [None, group_sizes]) @pytest.mark.parametrize('group', [None, group_sizes])
@pytest.mark.parametrize('boosting_type', boosting_types) @pytest.mark.parametrize('boosting_type', boosting_types)
def test_ranker(output, group, boosting_type, client): @pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
def test_ranker(output, group, boosting_type, tree_learner, client):
if output == 'dataframe-with-categorical': if output == 'dataframe-with-categorical':
X, y, w, g, dX, dy, dw, dg = _create_data( X, y, w, g, dX, dy, dw, dg = _create_data(
objective='ranking', objective='ranking',
...@@ -666,7 +671,7 @@ def test_ranker(output, group, boosting_type, client): ...@@ -666,7 +671,7 @@ def test_ranker(output, group, boosting_type, client):
dask_ranker = lgb.DaskLGBMRanker( dask_ranker = lgb.DaskLGBMRanker(
client=client, client=client,
time_out=5, time_out=5,
tree_learner_type='data_parallel', tree_learner_type=tree_learner,
**params **params
) )
dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg) dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
...@@ -961,23 +966,37 @@ def test_warns_and_continues_on_unrecognized_tree_learner(client): ...@@ -961,23 +966,37 @@ def test_warns_and_continues_on_unrecognized_tree_learner(client):
client.close(timeout=CLIENT_CLOSE_TIMEOUT) client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_warns_but_makes_no_changes_for_feature_or_voting_tree_learner(client): @pytest.mark.parametrize('tree_learner', ['data_parallel', 'voting_parallel'])
X = da.random.random((1e3, 10)) def test_training_respects_tree_learner_aliases(tree_learner, client):
y = da.random.random((1e3, 1)) task = 'regression'
for tree_learner in ['feature_parallel', 'voting']: _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output='array')
dask_factory = task_to_dask_factory[task]
dask_model = dask_factory(
client=client,
tree_learner=tree_learner,
time_out=5,
n_estimators=10,
num_leaves=15
)
dask_model.fit(dX, dy, sample_weight=dw, group=dg)
assert dask_model.fitted_
assert dask_model.get_params()['tree_learner'] == tree_learner
def test_error_on_feature_parallel_tree_learner(client):
X = da.random.random((100, 10), chunks=(50, 10))
y = da.random.random(100, chunks=50)
dask_regressor = lgb.DaskLGBMRegressor( dask_regressor = lgb.DaskLGBMRegressor(
client=client, client=client,
time_out=5, time_out=5,
tree_learner=tree_learner, tree_learner='feature_parallel',
n_estimators=1, n_estimators=1,
num_leaves=2 num_leaves=2
) )
with pytest.warns(UserWarning, match='Support for tree_learner %s in lightgbm' % tree_learner): with pytest.raises(lgb.basic.LightGBMError, match='Do not support feature parallel in c api'):
dask_regressor = dask_regressor.fit(X, y) dask_regressor = dask_regressor.fit(X, y)
assert dask_regressor.fitted_
assert dask_regressor.get_params()['tree_learner'] == tree_learner
client.close(timeout=CLIENT_CLOSE_TIMEOUT) client.close(timeout=CLIENT_CLOSE_TIMEOUT)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment