[tests][python] refined python tests (#2483)

* speed up tests
* more updates
* fixed pylint
* updated tests
* Update test_sklearn.py
* test that indices are sorted internally

[tests][python] refined python tests (#2483)
* speed up tests
* more updates
* fixed pylint
* updated tests
* Update test_sklearn.py
* test that indices are sorted internally
1f1dc452 · Nikita Titov · GitHub · 00d1e693 · 1f1dc452 · 1f1dc452
Unverified commit 1f1dc452, authored Oct 27, 2019 by Nikita Titov; committed by GitHub on Oct 27, 2019.
4 changed files
--- a/tests/c_api_test/test_.py
+++ b/tests/c_api_test/test_.py
@@ -69,7 +69,8 @@ def load_from_file(filename, reference):
    LIB.LGBM_DatasetCreateFromFile(
        c_str(filename),
        c_str('max_bin=15'),
-        ref, ctypes.byref(handle))
+        ref,
+        ctypes.byref(handle))
    print(LIB.LGBM_GetLastError())
    num_data = ctypes.c_long()
    LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
@@ -88,8 +89,9 @@ def load_from_csr(filename, reference):
    label = []
    with open(filename, 'r') as inp:
        for line in inp.readlines():
-            data.append([float(x) for x in line.split('\t')[1:]])
-            label.append(float(line.split('\t')[0]))
+            values = line.split('\t')
+            data.append([float(x) for x in values[1:]])
+            label.append(float(values[0]))
    mat = np.array(data)
    label = np.array(label, dtype=np.float32)
    csr = sparse.csr_matrix(mat)
@@ -124,8 +126,9 @@ def load_from_csc(filename, reference):
    label = []
    with open(filename, 'r') as inp:
        for line in inp.readlines():
-            data.append([float(x) for x in line.split('\t')[1:]])
-            label.append(float(line.split('\t')[0]))
+            values = line.split('\t')
+            data.append([float(x) for x in values[1:]])
+            label.append(float(values[0]))
    mat = np.array(data)
    label = np.array(label, dtype=np.float32)
    csr = sparse.csc_matrix(mat)
@@ -160,8 +163,9 @@ def load_from_mat(filename, reference):
    label = []
    with open(filename, 'r') as inp:
        for line in inp.readlines():
-            data.append([float(x) for x in line.split('\t')[1:]])
-            label.append(float(line.split('\t')[0]))
+            values = line.split('\t')
+            data.append([float(x) for x in values[1:]])
+            label.append(float(values[0]))
    mat = np.array(data)
    data = np.array(mat.reshape(mat.size), copy=False)
    label = np.array(label, dtype=np.float32)
@@ -222,7 +226,7 @@ def test_booster():
        ctypes.byref(booster))
    LIB.LGBM_BoosterAddValidData(booster, test)
    is_finished = ctypes.c_int(0)
-    for i in range(1, 101):
+    for i in range(1, 51):
        LIB.LGBM_BoosterUpdateOneIter(booster, ctypes.byref(is_finished))
        result = np.array([0.0], dtype=np.float64)
        out_len = ctypes.c_ulong(0)
@@ -260,7 +264,7 @@ def test_booster():
        mat.shape[1],
        1,
        1,
-        50,
+        25,
        c_str(''),
        ctypes.byref(num_preb),
        preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
@@ -270,7 +274,7 @@ def test_booster():
                           '../../examples/binary_classification/binary.test')),
        0,
        0,
-        50,
+        25,
        c_str(''),
        c_str('preb.txt'))
    LIB.LGBM_BoosterFree(booster2)
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -31,13 +31,13 @@ class TestBasic(unittest.TestCase):
        bst = lgb.Booster(params, train_data)
        bst.add_valid(valid_data, "valid_1")

-        for i in range(30):
+        for i in range(20):
            bst.update()
            if i % 10 == 0:
                print(bst.eval_train(), bst.eval_valid())

-        self.assertEqual(bst.current_iteration(), 30)
-        self.assertEqual(bst.num_trees(), 30)
+        self.assertEqual(bst.current_iteration(), 20)
+        self.assertEqual(bst.num_trees(), 20)
        self.assertEqual(bst.num_model_per_iteration(), 1)

        bst.save_model("model.txt")
@@ -48,26 +48,20 @@ class TestBasic(unittest.TestCase):
            dump_svmlight_file(X_test, y_test, f)
        pred_from_file = bst.predict(tname)
        os.remove(tname)
-        self.assertEqual(len(pred_from_matr), len(pred_from_file))
-        for preds in zip(pred_from_matr, pred_from_file):
-            self.assertAlmostEqual(*preds, places=15)
+        np.testing.assert_allclose(pred_from_matr, pred_from_file)

        # check saved model persistence
        bst = lgb.Booster(params, model_file="model.txt")
        os.remove("model.txt")
        pred_from_model_file = bst.predict(X_test)
-        self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
-        for preds in zip(pred_from_matr, pred_from_model_file):
-            # we need to check the consistency of model file here, so test for exact equal
-            self.assertEqual(*preds)
+        # we need to check the consistency of model file here, so test for exact equal
+        np.testing.assert_array_equal(pred_from_matr, pred_from_model_file)

        # check early stopping is working. Make it stop very early, so the scores should be very close to zero
        pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
        pred_early_stopping = bst.predict(X_test, **pred_parameter)
-        self.assertEqual(len(pred_from_matr), len(pred_early_stopping))
-        for preds in zip(pred_early_stopping, pred_from_matr):
-            # scores likely to be different, but prediction should still be the same
-            self.assertEqual(preds[0] > 0, preds[1] > 0)
+        # scores likely to be different, but prediction should still be the same
+        np.testing.assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping))

        # test that shape is checked during prediction
        bad_X_test = X_test[:, 1:]
@@ -97,7 +91,6 @@ class TestBasic(unittest.TestCase):

        train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
        valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
-
        train_data.construct()
        valid_data.construct()

@@ -108,23 +101,23 @@ class TestBasic(unittest.TestCase):
                                          '../../examples/lambdarank/rank.train.query'))
        lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
        self.assertEqual(len(lgb_train.get_group()), 201)
-        subset = lgb_train.subset(list(lgb.compat.range_(10))).construct()
+        subset = lgb_train.subset(list(range(10))).construct()
        subset_group = subset.get_group()
        self.assertEqual(len(subset_group), 2)
        self.assertEqual(subset_group[0], 1)
        self.assertEqual(subset_group[1], 9)

    def test_add_features_throws_if_num_data_unequal(self):
-        X1 = np.random.random((1000, 1))
-        X2 = np.random.random((100, 1))
+        X1 = np.random.random((100, 1))
+        X2 = np.random.random((10, 1))
        d1 = lgb.Dataset(X1).construct()
        d2 = lgb.Dataset(X2).construct()
        with self.assertRaises(lgb.basic.LightGBMError):
            d1.add_features_from(d2)

    def test_add_features_throws_if_datasets_unconstructed(self):
-        X1 = np.random.random((1000, 1))
-        X2 = np.random.random((1000, 1))
+        X1 = np.random.random((100, 1))
+        X2 = np.random.random((100, 1))
        with self.assertRaises(ValueError):
            d1 = lgb.Dataset(X1)
            d2 = lgb.Dataset(X2)
@@ -139,7 +132,8 @@ class TestBasic(unittest.TestCase):
            d1.add_features_from(d2)

    def test_add_features_equal_data_on_alternating_used_unused(self):
-        X = np.random.random((1000, 5))
+        self.maxDiff = None
+        X = np.random.random((100, 5))
        X[:, [1, 3]] = 0
        names = ['col_%d' % i for i in range(5)]
        for j in range(1, 5):
@@ -162,7 +156,8 @@ class TestBasic(unittest.TestCase):
            self.assertEqual(dtxt, d1txt)

    def test_add_features_same_booster_behaviour(self):
-        X = np.random.random((1000, 5))
+        self.maxDiff = None
+        X = np.random.random((100, 5))
        X[:, [1, 3]] = 0
        names = ['col_%d' % i for i in range(5)]
        for j in range(1, 5):
@@ -170,7 +165,7 @@ class TestBasic(unittest.TestCase):
            d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
            d1.add_features_from(d2)
            d = lgb.Dataset(X, feature_name=names).construct()
-            y = np.random.random(1000)
+            y = np.random.random(100)
            d1.set_label(y)
            d.set_label(y)
            b1 = lgb.Booster(train_set=d1)
@@ -191,7 +186,7 @@ class TestBasic(unittest.TestCase):
            self.assertEqual(dtxt, d1txt)

    def test_get_feature_penalty_and_monotone_constraints(self):
-        X = np.random.random((1000, 1))
+        X = np.random.random((100, 1))
        d = lgb.Dataset(X, params={'feature_penalty': [0.5],
                                   'monotone_constraints': [1]}).construct()
        np.testing.assert_allclose(d.get_feature_penalty(), [0.5])
@@ -201,7 +196,7 @@ class TestBasic(unittest.TestCase):
        self.assertIsNone(d.get_monotone_constraints())

    def test_add_features_feature_penalty(self):
-        X = np.random.random((1000, 2))
+        X = np.random.random((100, 2))
        test_cases = [
            (None, None, None),
            ([0.5], None, [0.5, 1]),
@@ -220,7 +215,7 @@ class TestBasic(unittest.TestCase):
                np.testing.assert_allclose(actual, expected)

    def test_add_features_monotone_types(self):
-        X = np.random.random((1000, 2))
+        X = np.random.random((100, 2))
        test_cases = [
            (None, None, None),
            ([1], None, [1, 0]),
@@ -239,9 +234,9 @@ class TestBasic(unittest.TestCase):
                np.testing.assert_array_equal(actual, expected)

    def test_cegb_affects_behavior(self):
-        X = np.random.random((1000, 5))
+        X = np.random.random((100, 5))
        X[:, [1, 3]] = 0
-        y = np.random.random(1000)
+        y = np.random.random(100)
        names = ['col_%d' % i for i in range(5)]
        ds = lgb.Dataset(X, feature_name=names).construct()
        ds.set_label(y)
@@ -269,9 +264,9 @@ class TestBasic(unittest.TestCase):
            self.assertNotEqual(basetxt, casetxt)

    def test_cegb_scaling_equalities(self):
-        X = np.random.random((1000, 5))
+        X = np.random.random((100, 5))
        X[:, [1, 3]] = 0
-        y = np.random.random(1000)
+        y = np.random.random(100)
        names = ['col_%d' % i for i in range(5)]
        ds = lgb.Dataset(X, feature_name=names).construct()
        ds.set_label(y)
@@ -298,9 +293,9 @@ class TestBasic(unittest.TestCase):
            with tempfile.NamedTemporaryFile() as f:
                p2name = f.name
            booster2.save_model(p2name)
-            self.maxDiff = None
            with open(p2name, 'rt') as f:
                p2txt = f.read()
+            self.maxDiff = None
            self.assertEqual(p1txt, p2txt)

    def test_consistent_state_for_dataset_fields(self):

--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py