Unverified Commit f879018b authored by jmoralez's avatar jmoralez Committed by GitHub
Browse files

[tests][dask] test all boosting types (fixes #3896) (#4119)

* test all boosting types

* lint

* bring scores comparison back and set y as second argument in assert_eq
parent e4cf2e4f
...@@ -45,6 +45,7 @@ CLIENT_CLOSE_TIMEOUT = 120 ...@@ -45,6 +45,7 @@ CLIENT_CLOSE_TIMEOUT = 120
tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking'] tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking']
data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical'] data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical']
boosting_types = ['gbdt', 'dart', 'goss', 'rf']
group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50] group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
task_to_dask_factory = { task_to_dask_factory = {
'regression': lgb.DaskLGBMRegressor, 'regression': lgb.DaskLGBMRegressor,
...@@ -233,16 +234,25 @@ def _unpickle(filepath, serializer): ...@@ -233,16 +234,25 @@ def _unpickle(filepath, serializer):
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
def test_classifier(output, task, client): @pytest.mark.parametrize('boosting_type', boosting_types)
def test_classifier(output, task, boosting_type, client):
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(
objective=task, objective=task,
output=output output=output
) )
params = { params = {
"boosting_type": boosting_type,
"n_estimators": 50, "n_estimators": 50,
"num_leaves": 31 "num_leaves": 31
} }
if boosting_type == 'rf':
params.update({
'bagging_freq': 1,
'bagging_fraction': 0.9,
})
elif boosting_type == 'goss':
params['top_rate'] = 0.5
dask_classifier = lgb.DaskLGBMClassifier( dask_classifier = lgb.DaskLGBMClassifier(
client=client, client=client,
...@@ -263,13 +273,18 @@ def test_classifier(output, task, client): ...@@ -263,13 +273,18 @@ def test_classifier(output, task, client):
p2_proba = local_classifier.predict_proba(X) p2_proba = local_classifier.predict_proba(X)
s2 = local_classifier.score(X, y) s2 = local_classifier.score(X, y)
assert_eq(s1, s2) if boosting_type == 'rf' and output == 'dataframe-with-categorical':
assert_eq(p1, p2) # https://github.com/microsoft/LightGBM/issues/4118
assert_eq(y, p1) assert_eq(s1, s2, atol=0.01)
assert_eq(y, p2) assert_eq(p1_proba, p2_proba, atol=0.8)
assert_eq(p1_proba, p2_proba, atol=0.01) else:
assert_eq(p1_local, p2) assert_eq(s1, s2)
assert_eq(y, p1_local) assert_eq(p1, p2)
assert_eq(p1, y)
assert_eq(p2, y)
assert_eq(p1_proba, p2_proba, atol=0.03)
assert_eq(p1_local, p2)
assert_eq(p1_local, y)
# pref_leaf values should have the right shape # pref_leaf values should have the right shape
# and values that look like valid tree nodes # and values that look like valid tree nodes
...@@ -401,17 +416,24 @@ def test_training_does_not_fail_on_port_conflicts(client): ...@@ -401,17 +416,24 @@ def test_training_does_not_fail_on_port_conflicts(client):
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize('output', data_output)
def test_regressor(output, client): @pytest.mark.parametrize('boosting_type', boosting_types)
def test_regressor(output, boosting_type, client):
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(
objective='regression', objective='regression',
output=output output=output
) )
params = { params = {
"boosting_type": boosting_type,
"random_state": 42, "random_state": 42,
"num_leaves": 31, "num_leaves": 31,
"n_estimators": 20, "n_estimators": 20,
} }
if boosting_type == 'rf':
params.update({
'bagging_freq': 1,
'bagging_fraction': 0.9,
})
dask_regressor = lgb.DaskLGBMRegressor( dask_regressor = lgb.DaskLGBMRegressor(
client=client, client=client,
...@@ -569,7 +591,8 @@ def test_regressor_quantile(output, client, alpha): ...@@ -569,7 +591,8 @@ def test_regressor_quantile(output, client, alpha):
@pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical']) @pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical'])
@pytest.mark.parametrize('group', [None, group_sizes]) @pytest.mark.parametrize('group', [None, group_sizes])
def test_ranker(output, client, group): @pytest.mark.parametrize('boosting_type', boosting_types)
def test_ranker(output, group, boosting_type, client):
if output == 'dataframe-with-categorical': if output == 'dataframe-with-categorical':
X, y, w, g, dX, dy, dw, dg = _create_data( X, y, w, g, dX, dy, dw, dg = _create_data(
objective='ranking', objective='ranking',
...@@ -597,11 +620,17 @@ def test_ranker(output, client, group): ...@@ -597,11 +620,17 @@ def test_ranker(output, client, group):
# use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
# serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210. # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
params = { params = {
"boosting_type": boosting_type,
"random_state": 42, "random_state": 42,
"n_estimators": 50, "n_estimators": 50,
"num_leaves": 20, "num_leaves": 20,
"min_child_samples": 1 "min_child_samples": 1
} }
if boosting_type == 'rf':
params.update({
'bagging_freq': 1,
'bagging_fraction': 0.9,
})
dask_ranker = lgb.DaskLGBMRanker( dask_ranker = lgb.DaskLGBMRanker(
client=client, client=client,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment