# coding: utf-8
# pylint: skip-file
import os
import tempfile
import unittest

import lightgbm as lgb
import numpy as np
from sklearn.datasets import load_breast_cancer, dump_svmlight_file
from sklearn.model_selection import train_test_split

class TestBasic(unittest.TestCase):

    def test(self):
        """Train a Booster through the low-level API and verify prediction
        consistency across in-memory, svmlight-file and saved-model inputs,
        plus prediction early stopping."""
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True),
                                                            test_size=0.1, random_state=2)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = train_data.create_valid(X_test, label=y_test)

        params = {
            "objective": "binary",
            "metric": "auc",
            "min_data": 10,
            "num_leaves": 15,
            "verbose": -1,
            "num_threads": 1,
            "max_bin": 255
        }
        bst = lgb.Booster(params, train_data)
        bst.add_valid(valid_data, "valid_1")

        # 30 boosting iterations, printing train/valid metrics every 10th.
        for i in range(30):
            bst.update()
            if i % 10 == 0:
                print(bst.eval_train(), bst.eval_valid())

        self.assertEqual(bst.current_iteration(), 30)
        self.assertEqual(bst.num_trees(), 30)
        self.assertEqual(bst.num_model_per_iteration(), 1)

        bst.save_model("model.txt")
        pred_from_matr = bst.predict(X_test)
        # NamedTemporaryFile is used only to reserve a unique path; the file is
        # re-opened by name afterwards because some platforms (e.g. Windows)
        # cannot open the same temporary file twice.
        with tempfile.NamedTemporaryFile() as f:
            tname = f.name
        try:
            with open(tname, "w+b") as f:
                dump_svmlight_file(X_test, y_test, f)
            pred_from_file = bst.predict(tname)
        finally:
            # always remove the temp file, even if prediction raises
            os.remove(tname)
        self.assertEqual(len(pred_from_matr), len(pred_from_file))
        for preds in zip(pred_from_matr, pred_from_file):
            self.assertAlmostEqual(*preds, places=15)

        # check saved model persistence
        bst = lgb.Booster(params, model_file="model.txt")
        pred_from_model_file = bst.predict(X_test)
        self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
        for preds in zip(pred_from_matr, pred_from_model_file):
            # we need to check the consistency of model file here, so test for exact equal
            self.assertEqual(*preds)

        # check early stopping is working. Make it stop very early, so the scores should be very close to zero
        pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
        pred_early_stopping = bst.predict(X_test, **pred_parameter)
        self.assertEqual(len(pred_from_matr), len(pred_early_stopping))
        for preds in zip(pred_early_stopping, pred_from_matr):
            # scores likely to be different, but prediction should still be the same
            self.assertEqual(preds[0] > 0, preds[1] > 0)

    def test_chunked_dataset(self):
        """Verify that Datasets can be constructed from lists of row-wise
        chunks of the feature matrix instead of a single 2-D array."""
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True),
                                                            test_size=0.1, random_state=2)

        # Split each feature matrix into roughly 10 row-wise chunks; the +1 in
        # the range bound picks up the final partial chunk.
        chunk_size = X_train.shape[0] // 10 + 1
        X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :]
                   for i in range(X_train.shape[0] // chunk_size + 1)]
        X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :]
                  for i in range(X_test.shape[0] // chunk_size + 1)]

        train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
        valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})

        # construct() forces the lazy Dataset build, which is where chunked
        # input would fail if unsupported.
        train_data.construct()
        valid_data.construct()