Unverified commit 8cc6eefc, authored by jmoralez, committed by GitHub
Browse files

[tests][dask] Create an informative categorical feature (#4113)

* Make one categorical variable informative, increase n_samples, and reduce n_features for regression.

* Adjust tolerances in checks.
parent a45ed16f
...@@ -131,7 +131,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs) ...@@ -131,7 +131,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
return X, y, w, g_rle, dX, dy, dw, dg return X, y, w, g_rle, dX, dy, dw, dg
def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwargs): def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **kwargs):
if objective.endswith('classification'): if objective.endswith('classification'):
if objective == 'binary-classification': if objective == 'binary-classification':
centers = [[-4, -4], [4, 4]] centers = [[-4, -4], [4, 4]]
...@@ -141,7 +141,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar ...@@ -141,7 +141,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
raise ValueError(f"Unknown classification task '{objective}'") raise ValueError(f"Unknown classification task '{objective}'")
X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42) X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
elif objective == 'regression': elif objective == 'regression':
X, y = make_regression(n_samples=n_samples, random_state=42) X, y = make_regression(n_samples=n_samples, n_features=4, n_informative=2, random_state=42)
elif objective == 'ranking': elif objective == 'ranking':
return _create_ranking_data( return _create_ranking_data(
n_samples=n_samples, n_samples=n_samples,
...@@ -161,7 +161,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar ...@@ -161,7 +161,7 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
elif output.startswith('dataframe'): elif output.startswith('dataframe'):
X_df = pd.DataFrame(X, columns=['feature_%d' % i for i in range(X.shape[1])]) X_df = pd.DataFrame(X, columns=['feature_%d' % i for i in range(X.shape[1])])
if output == 'dataframe-with-categorical': if output == 'dataframe-with-categorical':
num_cat_cols = 5 num_cat_cols = 2
for i in range(num_cat_cols): for i in range(num_cat_cols):
col_name = "cat_col" + str(i) col_name = "cat_col" + str(i)
cat_values = rnd.choice(['a', 'b'], X.shape[0]) cat_values = rnd.choice(['a', 'b'], X.shape[0])
...@@ -172,13 +172,15 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar ...@@ -172,13 +172,15 @@ def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwar
X_df[col_name] = cat_series X_df[col_name] = cat_series
X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1))) X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1)))
# for the small data sizes used in tests, it's hard to get LGBMRegressor to choose # make one categorical feature relevant to the target
# categorical features for splits. So for regression tests with categorical features, cat_col_is_a = X_df['cat_col0'] == 'a'
# _create_data() returns a DataFrame with ONLY categorical features
if objective == 'regression': if objective == 'regression':
cat_cols = [col for col in X_df.columns if col.startswith('cat_col')] y = np.where(cat_col_is_a, y, 2 * y)
X_df = X_df[cat_cols] elif objective == 'binary-classification':
X = X[:, -num_cat_cols:] y = np.where(cat_col_is_a, y, 1 - y)
elif objective == 'multiclass-classification':
n_classes = 3
y = np.where(cat_col_is_a, y, (1 + y) % n_classes)
y_df = pd.Series(y, name='target') y_df = pd.Series(y, name='target')
dX = dd.from_pandas(X_df, chunksize=chunk_size) dX = dd.from_pandas(X_df, chunksize=chunk_size)
dy = dd.from_pandas(y_df, chunksize=chunk_size) dy = dd.from_pandas(y_df, chunksize=chunk_size)
...@@ -238,8 +240,8 @@ def test_classifier(output, task, client): ...@@ -238,8 +240,8 @@ def test_classifier(output, task, client):
) )
params = { params = {
"n_estimators": 10, "n_estimators": 50,
"num_leaves": 10 "num_leaves": 31
} }
dask_classifier = lgb.DaskLGBMClassifier( dask_classifier = lgb.DaskLGBMClassifier(
...@@ -265,7 +267,7 @@ def test_classifier(output, task, client): ...@@ -265,7 +267,7 @@ def test_classifier(output, task, client):
assert_eq(p1, p2) assert_eq(p1, p2)
assert_eq(y, p1) assert_eq(y, p1)
assert_eq(y, p2) assert_eq(y, p2)
assert_eq(p1_proba, p2_proba, atol=0.3) assert_eq(p1_proba, p2_proba, atol=0.01)
assert_eq(p1_local, p2) assert_eq(p1_local, p2)
assert_eq(y, p1_local) assert_eq(y, p1_local)
...@@ -407,7 +409,8 @@ def test_regressor(output, client): ...@@ -407,7 +409,8 @@ def test_regressor(output, client):
params = { params = {
"random_state": 42, "random_state": 42,
"num_leaves": 10 "num_leaves": 31,
"n_estimators": 20,
} }
dask_regressor = lgb.DaskLGBMRegressor( dask_regressor = lgb.DaskLGBMRegressor(
...@@ -420,8 +423,7 @@ def test_regressor(output, client): ...@@ -420,8 +423,7 @@ def test_regressor(output, client):
p1 = dask_regressor.predict(dX) p1 = dask_regressor.predict(dX)
p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True) p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
if not output.startswith('dataframe'): s1 = _r2_score(dy, p1)
s1 = _r2_score(dy, p1)
p1 = p1.compute() p1 = p1.compute()
p1_local = dask_regressor.to_local().predict(X) p1_local = dask_regressor.to_local().predict(X)
s1_local = dask_regressor.to_local().score(X, y) s1_local = dask_regressor.to_local().score(X, y)
...@@ -432,9 +434,8 @@ def test_regressor(output, client): ...@@ -432,9 +434,8 @@ def test_regressor(output, client):
p2 = local_regressor.predict(X) p2 = local_regressor.predict(X)
# Scores should be the same # Scores should be the same
if not output.startswith('dataframe'): assert_eq(s1, s2, atol=0.01)
assert_eq(s1, s2, atol=.01) assert_eq(s1, s1_local)
assert_eq(s1, s1_local, atol=.003)
# Predictions should be roughly the same. # Predictions should be roughly the same.
assert_eq(p1, p1_local) assert_eq(p1, p1_local)
...@@ -450,13 +451,8 @@ def test_regressor(output, client): ...@@ -450,13 +451,8 @@ def test_regressor(output, client):
assert np.min(pred_leaf_vals) >= 0 assert np.min(pred_leaf_vals) >= 0
assert len(np.unique(pred_leaf_vals)) <= params['num_leaves'] assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
# The checks below are skipped assert_eq(p1, y, rtol=0.5, atol=50.)
# for the categorical data case because it's difficult to get assert_eq(p2, y, rtol=0.5, atol=50.)
# a good fit from just categoricals for a regression problem
# with small data
if output != 'dataframe-with-categorical':
assert_eq(y, p1, rtol=1., atol=100.)
assert_eq(y, p2, rtol=1., atol=50.)
# be sure LightGBM actually used at least one categorical column, # be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature # and that it was correctly treated as a categorical feature
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment