Unverified commit f2a32f9d, authored by Nikita Titov, committed by GitHub

[python-package][tests] enhance `test_set_field_none_removes_field` test (#7044)



* Update test_basic.py

* dev

* dev

* dev

* dev

* dev

* dev

* Update test_basic.py

* Update test_basic.py

* Update test_basic.py

* Update test_basic.py

* Update test_basic.py

* dev

* dev

* dev

* dev

* dev

* dev

* dev

* dev

* dev

---------
Co-authored-by: James Lamb <jaylamb20@gmail.com>
parent fd1a0f4a
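The substance of this change is twofold: `test_set_field_none_removes_field` is parametrized over all four settable `Dataset` fields, and most exact-equality assertions move from `np.testing.assert_array_equal` to an `np_assert_array_equal` helper imported from the tests' `.utils` module, called with `strict=True` so that shape and dtype are compared, not just values. For orientation, here is a minimal sketch of what such a helper presumably looks like (the real implementation in `.utils` may differ; NumPy added the `strict` keyword to `assert_array_equal` in version 1.24):

```python
import inspect

import numpy as np


def np_assert_array_equal(*args, **kwargs):
    """Forward to np.testing.assert_array_equal(), tolerating older NumPy.

    strict=True (NumPy >= 1.24) additionally fails the assertion when the
    shapes or dtypes of the two arrays differ; on older NumPy the keyword
    does not exist, so it is dropped here before forwarding.
    """
    if "strict" not in inspect.signature(np.testing.assert_array_equal).parameters:
        kwargs.pop("strict", None)
    np.testing.assert_array_equal(*args, **kwargs)
```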
@@ -69,13 +69,13 @@ def test_basic(tmp_path):
     assert bst.feature_name() == feature_names
     pred_from_model_file = bst.predict(X_test)
     # we need to check the consistency of model file here, so test for exact equal
-    np.testing.assert_array_equal(pred_from_matr, pred_from_model_file)
+    np_assert_array_equal(pred_from_matr, pred_from_model_file, strict=True)
     # check early stopping is working. Make it stop very early, so the scores should be very close to zero
     pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
     pred_early_stopping = bst.predict(X_test, **pred_parameter)
     # scores likely to be different, but prediction should still be the same
-    np.testing.assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping))
+    np_assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping), strict=True)
     # test that shape is checked during prediction
     bad_X_test = X_test[:, 1:]
@@ -213,7 +213,7 @@ def test_sequence_get_data(num_seq, rng):
     used_indices = rng.choice(a=np.arange(nrow), size=nrow // 3, replace=False)
     subset_data = seq_ds.subset(used_indices).construct()
-    np.testing.assert_array_equal(subset_data.get_data(), X[sorted(used_indices)])
+    np_assert_array_equal(subset_data.get_data(), X[sorted(used_indices)], strict=True)


 def test_chunked_dataset():
@@ -1019,14 +1019,39 @@ def test_equal_datasets_from_one_and_several_matrices_w_different_layouts(rng, t
     assert filecmp.cmp(one_path, several_path)


-def test_set_field_none_removes_field(rng):
-    X1 = rng.uniform(size=(10, 1))
-    d1 = lgb.Dataset(X1).construct()
-    weight = rng.uniform(size=10)
-    out = d1.set_field("weight", weight)
-    assert out is d1
-    np.testing.assert_allclose(d1.get_field("weight"), weight)
-    d1.set_field("weight", None)
-    assert d1.get_field("weight") is None
+@pytest.mark.parametrize(
+    "field_name",
+    [
+        "group",
+        "init_score",
+        pytest.param(
+            "position",
+            marks=pytest.mark.skipif(
+                getenv("TASK", "") == "cuda",
+                reason="Positions in learning to rank is not supported in CUDA version yet",
+            ),
+        ),
+        "weight",
+    ],
+)
+def test_set_field_none_removes_field(rng, field_name):
+    X = rng.uniform(size=(10, 1))
+    d = lgb.Dataset(X).construct()
+    if field_name == "group":
+        field = [5, 5]
+        expected = np.array([0, 5, 10], dtype=np.int32)
+    elif field_name == "position":
+        field = [100, 20, 100, 10, 30, 10, 30, 10, 30, 30]
+        expected = np.array([0, 1, 0, 2, 3, 2, 3, 2, 3, 3], dtype=np.int32)
+    else:
+        field = rng.uniform(size=10)
+        expected = field.astype(np.float64 if field_name == "init_score" else np.float32)
+    out = d.set_field(field_name, field)
+    assert out is d
+    np_assert_array_equal(d.get_field(field_name), expected, strict=True)
+    d.set_field(field_name, None)
+    assert d.get_field(field_name) is None
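The expected arrays in the parametrized test above encode how LightGBM hands these fields back from `get_field` once set: `group` sizes come back as cumulative query boundaries, and raw `position` values come back as dense int32 IDs assigned in order of first appearance. The following sketch reproduces the expected arrays; the position mapping is inferred from the test's expected output, not from LightGBM's C++ internals:

```python
import numpy as np

# group sizes [5, 5] are stored as cumulative query boundaries [0, 5, 10]
group = [5, 5]
boundaries = np.concatenate([[0], np.cumsum(group)]).astype(np.int32)
assert boundaries.tolist() == [0, 5, 10]

# raw positions are re-coded as dense IDs in order of first appearance:
# 100 -> 0, 20 -> 1, 10 -> 2, 30 -> 3
positions = [100, 20, 100, 10, 30, 10, 30, 10, 30, 30]
first_seen = {}
dense = np.array([first_seen.setdefault(p, len(first_seen)) for p in positions], dtype=np.int32)
assert dense.tolist() == [0, 1, 0, 2, 3, 2, 3, 2, 3, 3]
```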
@@ -14,7 +14,7 @@ from sklearn.metrics import accuracy_score, r2_score

 import lightgbm as lgb

-from .utils import sklearn_multiclass_custom_objective
+from .utils import np_assert_array_equal, sklearn_multiclass_custom_objective

 if platform in {"cygwin", "win32"}:
     pytest.skip("lightgbm.dask is not currently supported on Windows", allow_module_level=True)
@@ -370,7 +370,7 @@ def test_classifier_pred_contrib(output, task, cluster):
            # raw scores will probably be different, but at least check that all predicted classes are the same
            pred_classes = np.argmax(computed_preds.toarray(), axis=1)
            local_pred_classes = np.argmax(local_preds_with_contrib[i].toarray(), axis=1)
-           np.testing.assert_array_equal(pred_classes, local_pred_classes)
+           np_assert_array_equal(pred_classes, local_pred_classes, strict=True)
        return

    preds_with_contrib = preds_with_contrib.compute()
@@ -40,6 +40,7 @@ from .utils import (
     logistic_sigmoid,
     make_synthetic_regression,
     mse_obj,
+    np_assert_array_equal,
     pickle_and_unpickle_object,
     sklearn_multiclass_custom_objective,
     softmax,
@@ -852,7 +853,7 @@ def test_ranking_with_position_information_with_dataset_constructor(tmp_path):
     # test get_position works
     positions_from_get = lgb_train.get_position()
-    np.testing.assert_array_equal(positions_from_get, positions)
+    np_assert_array_equal(positions_from_get, positions, strict=True)


 def test_early_stopping():
@@ -1398,7 +1399,7 @@ def test_cvbooster_save_load(tmp_path):
     cvbooster_from_string = lgb.CVBooster().model_from_string(model_string)
     for cvbooster_loaded in [cvbooster_from_txt_file, cvbooster_from_string]:
         assert best_iteration == cvbooster_loaded.best_iteration
-        np.testing.assert_array_equal(preds, cvbooster_loaded.predict(X_test))
+        np_assert_array_equal(preds, cvbooster_loaded.predict(X_test), strict=True)


 @pytest.mark.parametrize("serializer", SERIALIZERS)
@@ -1431,7 +1432,7 @@ def test_cvbooster_picklable(serializer):
     assert best_iteration == cvbooster_from_disk.best_iteration

     preds_from_disk = cvbooster_from_disk.predict(X_test)
-    np.testing.assert_array_equal(preds, preds_from_disk)
+    np_assert_array_equal(preds, preds_from_disk, strict=True)


 def test_feature_name():
@@ -2311,7 +2312,7 @@ def test_monotone_penalty_max():
     constrained_model = lgb.train(params_constrained_model, trainset_constrained_model, 10)

     # Check that a very high penalization is the same as not using the features at all
-    np.testing.assert_array_equal(constrained_model.predict(x), unconstrained_model_predictions)
+    np_assert_array_equal(constrained_model.predict(x), unconstrained_model_predictions, strict=True)


 def test_max_bin_by_feature():
@@ -3186,22 +3187,24 @@ def test_get_split_value_histogram(rng_fixed_seed):
     assert len(bins) == 8
     hist_idx, bins_idx = gbm.get_split_value_histogram(0)
     hist_name, bins_name = gbm.get_split_value_histogram(gbm.feature_name()[0])
-    np.testing.assert_array_equal(hist_idx, hist_name)
+    np_assert_array_equal(hist_idx, hist_name, strict=True)
     np.testing.assert_allclose(bins_idx, bins_name)
     hist_idx, bins_idx = gbm.get_split_value_histogram(X.shape[-1] - 1)
     hist_name, bins_name = gbm.get_split_value_histogram(gbm.feature_name()[X.shape[-1] - 1])
-    np.testing.assert_array_equal(hist_idx, hist_name)
+    np_assert_array_equal(hist_idx, hist_name, strict=True)
     np.testing.assert_allclose(bins_idx, bins_name)
     # test bins string type
     hist_vals, bin_edges = gbm.get_split_value_histogram(0, bins="auto")
     hist = gbm.get_split_value_histogram(0, bins="auto", xgboost_style=True)
     if lgb.compat.PANDAS_INSTALLED:
         mask = hist_vals > 0
-        np.testing.assert_array_equal(hist_vals[mask], hist["Count"].values)
+        # strict=False due to dtype mismatch: 'int64' and 'float64'
+        np_assert_array_equal(hist_vals[mask], hist["Count"].values, strict=False)
         np.testing.assert_allclose(bin_edges[1:][mask], hist["SplitValue"].values)
     else:
         mask = hist_vals > 0
-        np.testing.assert_array_equal(hist_vals[mask], hist[:, 1])
+        # strict=False due to dtype mismatch: 'int64' and 'float64'
+        np_assert_array_equal(hist_vals[mask], hist[:, 1], strict=False)
         np.testing.assert_allclose(bin_edges[1:][mask], hist[:, 0])
     # test histogram is disabled for categorical features
     with pytest.raises(
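The two `strict=False` call sites above are the exception that clarifies the rule: the histogram counts are `int64`, while the `xgboost_style=True` output stores them as `float64`, so a strict comparison would fail on dtype alone even though the values match. A standalone illustration (requires NumPy >= 1.24 for the `strict` keyword):

```python
import numpy as np

counts = np.array([1, 2, 3], dtype=np.int64)
as_float = counts.astype(np.float64)

# value-only comparison passes: 1 == 1.0, etc.
np.testing.assert_array_equal(counts, as_float)

# strict comparison also checks dtype, so int64 vs. float64 fails
try:
    np.testing.assert_array_equal(counts, as_float, strict=True)
except AssertionError:
    print("dtypes differ: int64 vs. float64")
```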
@@ -42,6 +42,7 @@ from .utils import (
     load_linnerud,
     make_ranking,
     make_synthetic_regression,
+    np_assert_array_equal,
     sklearn_multiclass_custom_objective,
     softmax,
 )
@@ -423,7 +424,7 @@ def test_multioutput_classifier():
     score = clf.score(X_test, y_test)
     assert score >= 0.2
     assert score <= 1.0
-    np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_))
+    np_assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_), strict=True)
     for classifier in clf.estimators_:
         assert isinstance(classifier, lgb.LGBMClassifier)
         assert isinstance(classifier.booster_, lgb.Booster)
@@ -454,7 +455,7 @@ def test_classifier_chain():
     score = clf.score(X_test, y_test)
     assert score >= 0.2
     assert score <= 1.0
-    np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_))
+    np_assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_), strict=True)
     assert order == clf.order_
     for classifier in clf.estimators_:
         assert isinstance(classifier, lgb.LGBMClassifier)
@@ -709,7 +710,7 @@ def test_joblib(tmp_path):
     gbm_pickle = joblib.load(model_path_pkl)
     assert isinstance(gbm_pickle.booster_, lgb.Booster)
     assert gbm.get_params() == gbm_pickle.get_params()
-    np.testing.assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_)
+    np_assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_, strict=True)

     assert gbm_pickle.learning_rate == pytest.approx(0.1)
     assert callable(gbm_pickle.objective)
@@ -750,7 +751,7 @@ def test_random_state_object(rng_constructor):
     y_pred1 = clf1.predict(X_test, raw_score=True)
     y_pred2 = clf2.predict(X_test, raw_score=True)
     np.testing.assert_allclose(y_pred1, y_pred2)
-    np.testing.assert_array_equal(clf1.feature_importances_, clf2.feature_importances_)
+    np_assert_array_equal(clf1.feature_importances_, clf2.feature_importances_, strict=True)
     df1 = clf1.booster_.model_to_string(num_iteration=0)
     df2 = clf2.booster_.model_to_string(num_iteration=0)
     assert df1 == df2
@@ -1514,13 +1515,13 @@ def test_continue_training_with_model():

 def test_actual_number_of_trees():
     X = [[1, 2, 3], [1, 2, 3]]
-    y = [1, 1]
+    y = [1.0, 1.0]
     n_estimators = 5
     gbm = lgb.LGBMRegressor(n_estimators=n_estimators).fit(X, y)
     assert gbm.n_estimators == n_estimators
     assert gbm.n_estimators_ == 1
     assert gbm.n_iter_ == 1
-    np.testing.assert_array_equal(gbm.predict(np.array(X) * 10), y)
+    np_assert_array_equal(gbm.predict(np.array(X) * 10), y, strict=True)


 def test_check_is_fitted():
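The `y = [1, 1]` to `y = [1.0, 1.0]` edit above is a consequence of the stricter assertion: the expected list is coerced to an array before comparison, and a list of Python ints becomes an integer-typed array, which can no longer match the `float64` output of `predict()`. A sketch of the failure mode (dtypes assume a typical 64-bit platform):

```python
import numpy as np

preds = np.ones(2, dtype=np.float64)  # LGBMRegressor.predict() returns float64

np.testing.assert_array_equal(preds, [1, 1])                   # ok: values match
np.testing.assert_array_equal(preds, [1.0, 1.0], strict=True)  # ok: float64 == float64
try:
    np.testing.assert_array_equal(preds, [1, 1], strict=True)  # int64 vs. float64
except AssertionError:
    print("strict=True rejects the integer-typed expected values")
```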
@@ -1638,7 +1639,7 @@ def test_getting_feature_names_in_np_input(estimator_class):
         model.fit(X, y, group=[X.shape[0]])
     else:
         model.fit(X, y)
-    np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))
+    np_assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]), strict=True)


 @pytest.mark.parametrize("estimator_class", estimator_classes)
@@ -1661,7 +1662,8 @@ def test_getting_feature_names_in_pd_input(estimator_class):
         model.fit(X, y, group=[X.shape[0]])
     else:
         model.fit(X, y)
-    np.testing.assert_array_equal(model.feature_names_in_, X.columns)
+    # strict=False due to dtype mismatch: '<U9' and 'object'
+    np_assert_array_equal(model.feature_names_in_, X.columns, strict=False)

     # Starting with scikit-learn 1.6 (https://github.com/scikit-learn/scikit-learn/pull/30149),
@@ -1741,7 +1743,7 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
     preds_1d = model_1d.predict(X)
     preds_2d = model_2d.predict(X)
-    np.testing.assert_array_equal(preds_1d, preds_2d)
+    np_assert_array_equal(preds_1d, preds_2d, strict=True)


 @pytest.mark.parametrize("use_weight", [True, False])