test_engine.py 81.2 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
# coding: utf-8
wxchan's avatar
wxchan committed
2
import copy
3
import itertools
wxchan's avatar
wxchan committed
4
5
import math
import os
6
import psutil
7
import random
wxchan's avatar
wxchan committed
8
9
import unittest

Guolin Ke's avatar
Guolin Ke committed
10
import lightgbm as lgb
wxchan's avatar
wxchan committed
11
import numpy as np
12
from scipy.sparse import csr_matrix
wxchan's avatar
wxchan committed
13
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
wxchan's avatar
wxchan committed
14
                              load_iris, load_svmlight_file)
15
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error, roc_auc_score
16
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GroupKFold
wxchan's avatar
wxchan committed
17

wxchan's avatar
wxchan committed
18
19
try:
    import cPickle as pickle
wxchan's avatar
wxchan committed
20
except ImportError:
wxchan's avatar
wxchan committed
21
    import pickle
wxchan's avatar
wxchan committed
22

wxchan's avatar
wxchan committed
23

24
25
26
27
28
29
30
# Shared module-level counter yielding 0, -1, -2, ...; each call to
# decreasing_metric() pulls the next value from it.
decreasing_generator = itertools.count(0, -1)


def dummy_obj(preds, train_data):
    """Return a pair of independent all-ones arrays shaped like ``preds``.

    ``train_data`` is accepted but never read.
    NOTE(review): presumably the pair is (grad, hess) for a custom
    objective callback -- confirm against the callers.
    """
    shape = preds.shape
    first = np.ones(shape)
    second = np.ones(shape)
    return first, second


wxchan's avatar
wxchan committed
31
32
33
def multi_logloss(y_true, y_pred):
    """Return the mean negative log of the probability given to each true class.

    ``y_pred[i][y_true[i]]`` is looked up for every sample ``i``; the
    per-sample values ``-log(p)`` are averaged with ``np.mean``.
    """
    per_sample = []
    for row_idx, label in enumerate(y_true):
        per_sample.append(-math.log(y_pred[row_idx][label]))
    return np.mean(per_sample)

wxchan's avatar
wxchan committed
34

Belinda Trotta's avatar
Belinda Trotta committed
35
36
37
38
39
40
41
def top_k_error(y_true, y_pred, k):
    """Return the fraction of samples whose true class is not strictly in the top ``k``.

    ``y_pred`` is indexed as ``y_pred[row, class]`` and ``y_true`` holds
    integer class indices used for that lookup.

    A sample counts as correct only when the score of its true class is
    strictly greater than the best score outside the top ``k``; ties
    therefore count as errors.

    When ``k`` is at least the number of classes every class is in the
    top ``k``, so the error is 0 (this also avoids the ``np.partition``
    failure for ``k`` larger than the number of columns).
    """
    if k >= y_pred.shape[1]:
        return 0
    # After partitioning -y_pred at position k, columns k.. hold the
    # (negated) scores that are NOT among the k largest.
    max_rest = np.max(-np.partition(-y_pred, k)[:, k:], axis=1)
    true_scores = y_pred[np.arange(len(y_true)), y_true]
    return 1 - np.mean(true_scores > max_rest)


42
43
44
45
46
47
48
49
def constant_metric(preds, train_data):
    """Return the fixed triple ``('error', 0.0, False)``; both arguments are ignored."""
    metric_name = 'error'
    metric_value = 0.0
    flag = False
    return metric_name, metric_value, flag


def decreasing_metric(preds, train_data):
    """Return ``('decreasing_metric', value, False)`` with the next counter value.

    The value comes from the module-level ``decreasing_generator``, so it
    changes on every call; both arguments are ignored.
    """
    current = next(decreasing_generator)
    return 'decreasing_metric', current, False


wxchan's avatar
wxchan committed
50
class TestEngine(unittest.TestCase):
wxchan's avatar
wxchan committed
51
    def test_binary(self):
        # Binary classification on the breast-cancer data; checks the hold-out
        # log loss and that the recorded eval history matches it.
        # return_X_y is passed by keyword: recent scikit-learn releases only
        # accept it as a keyword argument.
        X, y = load_breast_cancer(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbose': -1,
            'num_iteration': 50  # test num_iteration in dict here
        }
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=20,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)
        ret = log_loss(y_test, gbm.predict(X_test))
        self.assertLess(ret, 0.11)
        # 50 records are expected even though num_boost_round=20 was passed
        self.assertEqual(len(evals_result['valid_0']['binary_logloss']), 50)
        self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
wxchan's avatar
wxchan committed
72

Guolin Ke's avatar
Guolin Ke committed
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
    def test_rf(self):
        # Random-forest boosting on the breast-cancer data; checks the hold-out
        # log loss and that the last recorded eval value matches it.
        # return_X_y is passed by keyword: recent scikit-learn releases only
        # accept it as a keyword argument.
        X, y = load_breast_cancer(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'boosting_type': 'rf',
            'objective': 'binary',
            'bagging_freq': 1,
            'bagging_fraction': 0.5,
            'feature_fraction': 0.5,
            'num_leaves': 50,
            'metric': 'binary_logloss',
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)
        ret = log_loss(y_test, gbm.predict(X_test))
        self.assertLess(ret, 0.19)
        self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)

98
    def test_regression(self):
        # Regression with the l2 metric on the Boston data; checks the hold-out
        # MSE and that the last recorded eval value matches it.
        # return_X_y is passed by keyword: recent scikit-learn releases only
        # accept it as a keyword argument.
        X, y = load_boston(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'metric': 'l2',
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)
        ret = mean_squared_error(y_test, gbm.predict(X_test))
        self.assertLess(ret, 7)
        self.assertAlmostEqual(evals_result['valid_0']['l2'][-1], ret, places=5)
wxchan's avatar
wxchan committed
116

Guolin Ke's avatar
Guolin Ke committed
117
    def test_missing_value_handle(self):
        # One all-zero feature where 20 randomly chosen rows are set to NaN and
        # labelled 1; the model is trained and evaluated on the same data.
        features = np.zeros((100, 1))
        targets = np.zeros(100)
        nan_rows = random.sample(range(100), 20)
        features[nan_rows, 0] = np.nan
        targets[nan_rows] = 1
        train_set = lgb.Dataset(features, targets)
        valid_set = lgb.Dataset(features, targets)

        train_params = {
            'metric': 'l2',
            'verbose': -1,
            'boost_from_average': False
        }
        history = {}
        booster = lgb.train(train_params, train_set, num_boost_round=20, valid_sets=valid_set,
                            verbose_eval=False, evals_result=history)
        mse = mean_squared_error(targets, booster.predict(features))
        self.assertLess(mse, 0.005)
        self.assertAlmostEqual(history['valid_0']['l2'][-1], mse, places=5)

    def test_missing_value_handle_na(self):
        # The single NaN row shares its label with the low-valued rows; with
        # zero_as_missing disabled the one trained tree must reproduce the labels.
        feature_values = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
        labels = [1, 1, 1, 1, 0, 0, 0, 0, 1]

        features = np.array(feature_values).reshape(-1, 1)
        targets = np.array(labels)
        train_set = lgb.Dataset(features, targets)
        valid_set = lgb.Dataset(features, targets)

        train_params = {
            'objective': 'regression',
            'metric': 'auc',
            'verbose': -1,
            'boost_from_average': False,
            'min_data': 1,
            'num_leaves': 2,
            'learning_rate': 1,
            'min_data_in_bin': 1,
            'zero_as_missing': False
        }
        history = {}
        booster = lgb.train(train_params, train_set, num_boost_round=1, valid_sets=valid_set,
                            verbose_eval=False, evals_result=history)
        predictions = booster.predict(features)
        np.testing.assert_allclose(predictions, labels)
        auc = roc_auc_score(targets, predictions)
        self.assertGreater(auc, 0.999)
        self.assertAlmostEqual(history['valid_0']['auc'][-1], auc, places=5)
Guolin Ke's avatar
Guolin Ke committed
173
174
175
176
177
178
179
180
181
182
183

    def test_missing_value_handle_zero(self):
        # The 0 row and the NaN row both carry label 0; with zero_as_missing
        # enabled the one trained tree must reproduce the labels.
        feature_values = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
        labels = [0, 1, 1, 1, 0, 0, 0, 0, 0]

        features = np.array(feature_values).reshape(-1, 1)
        targets = np.array(labels)
        train_set = lgb.Dataset(features, targets)
        valid_set = lgb.Dataset(features, targets)

        train_params = {
            'objective': 'regression',
            'metric': 'auc',
            'verbose': -1,
            'boost_from_average': False,
            'min_data': 1,
            'num_leaves': 2,
            'learning_rate': 1,
            'min_data_in_bin': 1,
            'zero_as_missing': True
        }
        history = {}
        booster = lgb.train(train_params, train_set, num_boost_round=1, valid_sets=valid_set,
                            verbose_eval=False, evals_result=history)
        predictions = booster.predict(features)
        np.testing.assert_allclose(predictions, labels)
        auc = roc_auc_score(targets, predictions)
        self.assertGreater(auc, 0.999)
        self.assertAlmostEqual(history['valid_0']['auc'][-1], auc, places=5)
Guolin Ke's avatar
Guolin Ke committed
205
206
207
208
209
210
211
212
213
214
215

    def test_missing_value_handle_none(self):
        # With use_missing disabled the test expects the first, second and last
        # (NaN) rows to receive the same prediction, and only a moderate AUC.
        feature_values = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
        labels = [0, 1, 1, 1, 0, 0, 0, 0, 0]

        features = np.array(feature_values).reshape(-1, 1)
        targets = np.array(labels)
        train_set = lgb.Dataset(features, targets)
        valid_set = lgb.Dataset(features, targets)

        train_params = {
            'objective': 'regression',
            'metric': 'auc',
            'verbose': -1,
            'boost_from_average': False,
            'min_data': 1,
            'num_leaves': 2,
            'learning_rate': 1,
            'min_data_in_bin': 1,
            'use_missing': False
        }
        history = {}
        booster = lgb.train(train_params, train_set, num_boost_round=1, valid_sets=valid_set,
                            verbose_eval=False, evals_result=history)
        predictions = booster.predict(features)
        self.assertAlmostEqual(predictions[0], predictions[1])
        self.assertAlmostEqual(predictions[-1], predictions[0])
        auc = roc_auc_score(targets, predictions)
        self.assertGreater(auc, 0.83)
        self.assertAlmostEqual(history['valid_0']['auc'][-1], auc, places=5)
Guolin Ke's avatar
Guolin Ke committed
238

ChenZhiyong's avatar
ChenZhiyong committed
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
    def test_categorical_handle(self):
        # Column 0 is declared categorical and the labels alternate 0/1 with the
        # category value; the one trained tree must reproduce the labels.
        feature_values = [0, 1, 2, 3, 4, 5, 6, 7]
        labels = [0, 1, 0, 1, 0, 1, 0, 1]

        features = np.array(feature_values).reshape(-1, 1)
        targets = np.array(labels)
        train_set = lgb.Dataset(features, targets)
        valid_set = lgb.Dataset(features, targets)

        train_params = {
            'objective': 'regression',
            'metric': 'auc',
            'verbose': -1,
            'boost_from_average': False,
            'min_data': 1,
            'num_leaves': 2,
            'learning_rate': 1,
            'min_data_in_bin': 1,
            'min_data_per_group': 1,
            'cat_smooth': 1,
            'cat_l2': 0,
            'max_cat_to_onehot': 1,
            'zero_as_missing': True,
            'categorical_column': 0
        }
        history = {}
        booster = lgb.train(train_params, train_set, num_boost_round=1, valid_sets=valid_set,
                            verbose_eval=False, evals_result=history)
        predictions = booster.predict(features)
        np.testing.assert_allclose(predictions, labels)
        auc = roc_auc_score(targets, predictions)
        self.assertGreater(auc, 0.999)
        self.assertAlmostEqual(history['valid_0']['auc'][-1], auc, places=5)
ChenZhiyong's avatar
ChenZhiyong committed
275

276
    def test_categorical_handle_na(self):
        # Column 0 is declared categorical and holds only 0 or NaN; the NaN rows
        # are labelled 1 and the one trained tree must reproduce the labels.
        feature_values = [0, np.nan, 0, np.nan, 0, np.nan]
        labels = [0, 1, 0, 1, 0, 1]

        features = np.array(feature_values).reshape(-1, 1)
        targets = np.array(labels)
        train_set = lgb.Dataset(features, targets)
        valid_set = lgb.Dataset(features, targets)

        train_params = {
            'objective': 'regression',
            'metric': 'auc',
            'verbose': -1,
            'boost_from_average': False,
            'min_data': 1,
            'num_leaves': 2,
            'learning_rate': 1,
            'min_data_in_bin': 1,
            'min_data_per_group': 1,
            'cat_smooth': 1,
            'cat_l2': 0,
            'max_cat_to_onehot': 1,
            'zero_as_missing': False,
            'categorical_column': 0
        }
        history = {}
        booster = lgb.train(train_params, train_set, num_boost_round=1, valid_sets=valid_set,
                            verbose_eval=False, evals_result=history)
        predictions = booster.predict(features)
        np.testing.assert_allclose(predictions, labels)
        auc = roc_auc_score(targets, predictions)
        self.assertGreater(auc, 0.999)
        self.assertAlmostEqual(history['valid_0']['auc'][-1], auc, places=5)
Guolin Ke's avatar
Guolin Ke committed
312

wxchan's avatar
wxchan committed
313
    def test_multiclass(self):
        # 10-class classification on the digits data; checks the hold-out
        # multi-class log loss and that the last recorded eval value matches it.
        # Loader arguments are passed by keyword: recent scikit-learn releases
        # only accept them as keyword arguments.
        X, y = load_digits(n_class=10, return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'num_class': 10,
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, params=params)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)
        ret = multi_logloss(y_test, gbm.predict(X_test))
        self.assertLess(ret, 0.15)
        self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
wxchan's avatar
wxchan committed
333

334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
    def test_multiclass_rf(self):
        # Random-forest boosting on the 10-class digits data; checks the hold-out
        # multi-class log loss and that the last recorded eval value matches it.
        # Loader arguments are passed by keyword: recent scikit-learn releases
        # only accept them as keyword arguments.
        X, y = load_digits(n_class=10, return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'boosting_type': 'rf',
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'bagging_freq': 1,
            'bagging_fraction': 0.6,
            'feature_fraction': 0.6,
            'num_class': 10,
            'num_leaves': 50,
            'min_data': 1,
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, params=params)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)
        ret = multi_logloss(y_test, gbm.predict(X_test))
        self.assertLess(ret, 0.23)
        self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)

cbecker's avatar
cbecker committed
361
362
363
364
365
366
367
368
369
370
371
    def test_multiclass_prediction_early_stopping(self):
        # Trains a 10-class model once, then predicts with prediction-time early
        # stopping at two different margins and checks the resulting log loss.
        # Loader arguments are passed by keyword: recent scikit-learn releases
        # only accept them as keyword arguments.
        X, y = load_digits(n_class=10, return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'num_class': 10,
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, params=params)
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50)

        # small margin
        pred_parameter = {"pred_early_stop": True,
                          "pred_early_stop_freq": 5,
                          "pred_early_stop_margin": 1.5}
        ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter))
        self.assertLess(ret, 0.8)
        self.assertGreater(ret, 0.6)  # loss will be higher than when evaluating the full model

        # larger margin: the loss is expected to be much lower
        pred_parameter = {"pred_early_stop": True,
                          "pred_early_stop_freq": 5,
                          "pred_early_stop_margin": 5.5}
        ret = multi_logloss(y_test, gbm.predict(X_test, **pred_parameter))
        self.assertLess(ret, 0.2)

Belinda Trotta's avatar
Belinda Trotta committed
387
    def test_multi_class_error(self):
        # Compares the built-in multi_error metric (default and top-k variants)
        # with the independent top_k_error calculation.
        # Loader arguments are passed by keyword: recent scikit-learn releases
        # only accept them as keyword arguments.
        X, y = load_digits(n_class=10, return_X_y=True)
        params = {'objective': 'multiclass', 'num_classes': 10, 'metric': 'multi_error',
                  'num_leaves': 4, 'verbose': -1}
        lgb_data = lgb.Dataset(X, label=y)
        est = lgb.train(params, lgb_data, num_boost_round=10)
        predict_default = est.predict(X)
        results = {}
        est = lgb.train(dict(params, multi_error_top_k=1), lgb_data, num_boost_round=10,
                        valid_sets=[lgb_data], evals_result=results, verbose_eval=False)
        predict_1 = est.predict(X)
        # check that default gives same result as k = 1
        np.testing.assert_allclose(predict_1, predict_default)
        # check against independent calculation for k = 1
        err = top_k_error(y, predict_1, 1)
        self.assertAlmostEqual(results['training']['multi_error'][-1], err)
        # check against independent calculation for k = 2
        results = {}
        est = lgb.train(dict(params, multi_error_top_k=2), lgb_data, num_boost_round=10,
                        valid_sets=[lgb_data], evals_result=results, verbose_eval=False)
        predict_2 = est.predict(X)
        err = top_k_error(y, predict_2, 2)
        self.assertAlmostEqual(results['training']['multi_error@2'][-1], err)
        # check against independent calculation for k = 10
        results = {}
        est = lgb.train(dict(params, multi_error_top_k=10), lgb_data, num_boost_round=10,
                        valid_sets=[lgb_data], evals_result=results, verbose_eval=False)
        predict_3 = est.predict(X)
        err = top_k_error(y, predict_3, 10)
        self.assertAlmostEqual(results['training']['multi_error@10'][-1], err)
        # check cases where predictions are equal
        X = np.array([[0, 0], [0, 0]])
        y = np.array([0, 1])
        lgb_data = lgb.Dataset(X, label=y)
        params['num_classes'] = 2
        results = {}
        lgb.train(params, lgb_data, num_boost_round=10,
                  valid_sets=[lgb_data], evals_result=results, verbose_eval=False)
        # the test expects an error of 1 at k = 1 for these identical rows
        self.assertAlmostEqual(results['training']['multi_error'][-1], 1)
        results = {}
        lgb.train(dict(params, multi_error_top_k=2), lgb_data, num_boost_round=10,
                  valid_sets=[lgb_data], evals_result=results, verbose_eval=False)
        self.assertAlmostEqual(results['training']['multi_error@2'][-1], 0)
Belinda Trotta's avatar
Belinda Trotta committed
430

431
    def test_early_stopping(self):
        # Runs training twice with early_stopping_rounds=5: a short run that is
        # expected to use all rounds and a longer run that is expected to stop early.
        # return_X_y is passed by keyword: recent scikit-learn releases only
        # accept it as a keyword argument.
        X, y = load_breast_cancer(return_X_y=True)
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbose': -1
        }
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
        valid_set_name = 'valid_set'
        # no early stopping
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=10,
                        valid_sets=lgb_eval,
                        valid_names=valid_set_name,
                        verbose_eval=False,
                        early_stopping_rounds=5)
        self.assertEqual(gbm.best_iteration, 10)
        self.assertIn(valid_set_name, gbm.best_score)
        self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
        # early stopping occurs
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=40,
                        valid_sets=lgb_eval,
                        valid_names=valid_set_name,
                        verbose_eval=False,
                        early_stopping_rounds=5)
        self.assertLessEqual(gbm.best_iteration, 31)
        self.assertIn(valid_set_name, gbm.best_score)
        self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
462

463
    def test_continue_train(self):
        # Saves a 20-round model to disk, continues training from that file and
        # checks the built-in l1 metric against a custom MAE eval function.
        # return_X_y is passed by keyword: recent scikit-learn releases only
        # accept it as a keyword argument.
        X, y = load_boston(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'regression',
            'metric': 'l1',
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)
        init_gbm = lgb.train(params, lgb_train, num_boost_round=20)
        model_name = 'model.txt'
        init_gbm.save_model(model_name)
        # try/finally so the saved model file is removed even when an assertion fails
        try:
            evals_result = {}
            gbm = lgb.train(params, lgb_train,
                            num_boost_round=30,
                            valid_sets=lgb_eval,
                            verbose_eval=False,
                            # test custom eval metrics
                            feval=(lambda p, d: ('custom_mae', mean_absolute_error(p, d.get_label()), False)),
                            evals_result=evals_result,
                            init_model=model_name)
            ret = mean_absolute_error(y_test, gbm.predict(X_test))
            self.assertLess(ret, 2.0)
            self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
            np.testing.assert_allclose(evals_result['valid_0']['l1'], evals_result['valid_0']['custom_mae'])
        finally:
            os.remove(model_name)

491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
    def test_continue_train_dart(self):
        # Continues training a DART model from an in-memory booster and checks
        # the hold-out MAE against the last recorded l1 value.
        # return_X_y is passed by keyword: recent scikit-learn releases only
        # accept it as a keyword argument.
        X, y = load_boston(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'boosting_type': 'dart',
            'objective': 'regression',
            'metric': 'l1',
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)
        init_gbm = lgb.train(params, lgb_train, num_boost_round=50)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result,
                        init_model=init_gbm)
        ret = mean_absolute_error(y_test, gbm.predict(X_test))
        self.assertLess(ret, 2.0)
        self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)

wxchan's avatar
wxchan committed
514
    def test_continue_train_multiclass(self):
        # Continues training a 3-class model from an in-memory booster and checks
        # the hold-out multi-class log loss against the last recorded value.
        # return_X_y is passed by keyword: recent scikit-learn releases only
        # accept it as a keyword argument.
        X, y = load_iris(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'num_class': 3,
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, params=params, free_raw_data=False)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params, free_raw_data=False)
        init_gbm = lgb.train(params, lgb_train, num_boost_round=20)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=30,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result,
                        init_model=init_gbm)
        ret = multi_logloss(y_test, gbm.predict(X_test))
        self.assertLess(ret, 0.1)
        self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
wxchan's avatar
wxchan committed
536
537

    def test_cv(self):
        # Exercises lgb.cv: metric override, shuffling with callbacks, training
        # metrics, user-supplied folds, and lambdarank with grouped data.
        # return_X_y is passed by keyword: recent scikit-learn releases only
        # accept it as a keyword argument.
        X_train, y_train = load_boston(return_X_y=True)
        params = {'verbose': -1}
        lgb_train = lgb.Dataset(X_train, y_train)
        # shuffle = False, override metric in params
        params_with_metric = {'metric': 'l2', 'verbose': -1}
        cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=10,
                        nfold=3, stratified=False, shuffle=False,
                        metrics='l1', verbose_eval=False)
        self.assertIn('l1-mean', cv_res)
        self.assertNotIn('l2-mean', cv_res)
        self.assertEqual(len(cv_res['l1-mean']), 10)
        # shuffle = True, callbacks
        cv_res = lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=True,
                        metrics='l1', verbose_eval=False,
                        callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
        self.assertIn('l1-mean', cv_res)
        self.assertEqual(len(cv_res['l1-mean']), 10)
        # enable display training loss
        cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=10,
                        nfold=3, stratified=False, shuffle=False,
                        metrics='l1', verbose_eval=False, eval_train_metric=True)
        self.assertIn('train l1-mean', cv_res)
        self.assertIn('valid l1-mean', cv_res)
        self.assertNotIn('train l2-mean', cv_res)
        self.assertNotIn('valid l2-mean', cv_res)
        self.assertEqual(len(cv_res['train l1-mean']), 10)
        self.assertEqual(len(cv_res['valid l1-mean']), 10)
        # self defined folds: a split generator and the splitter object must agree
        tss = TimeSeriesSplit(3)
        folds = tss.split(X_train)
        cv_res_gen = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds,
                            verbose_eval=False)
        cv_res_obj = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=tss,
                            verbose_eval=False)
        np.testing.assert_allclose(cv_res_gen['l2-mean'], cv_res_obj['l2-mean'])
        # lambdarank: data files are located relative to this test file
        test_dir = os.path.dirname(os.path.realpath(__file__))
        X_train, y_train = load_svmlight_file(os.path.join(test_dir,
                                                           '../../examples/lambdarank/rank.train'))
        q_train = np.loadtxt(os.path.join(test_dir,
                                          '../../examples/lambdarank/rank.train.query'))
        params_lambdarank = {'objective': 'lambdarank', 'verbose': -1, 'eval_at': 3}
        lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
        # ... with l2 metric
        cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
                               metrics='l2', verbose_eval=False)
        self.assertEqual(len(cv_res_lambda), 2)
        self.assertFalse(np.isnan(cv_res_lambda['l2-mean']).any())
        # ... with NDCG (default) metric
        cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
                               verbose_eval=False)
        self.assertEqual(len(cv_res_lambda), 2)
        self.assertFalse(np.isnan(cv_res_lambda['ndcg@3-mean']).any())
        # self defined folds with lambdarank
        cv_res_lambda_obj = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10,
                                   folds=GroupKFold(n_splits=3),
                                   verbose_eval=False)
        np.testing.assert_allclose(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean'])
wxchan's avatar
wxchan committed
595

wxchan's avatar
wxchan committed
596
    def test_feature_name(self):
        # Checks that feature names passed to lgb.train are reported back by the booster.
        # return_X_y is passed by keyword: recent scikit-learn releases only
        # accept it as a keyword argument.
        X_train, y_train = load_boston(return_X_y=True)
        params = {'verbose': -1}
        lgb_train = lgb.Dataset(X_train, y_train)
        feature_names = ['f_' + str(i) for i in range(X_train.shape[-1])]
        gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names)
        self.assertListEqual(feature_names, gbm.feature_name())
        # test feature_names with whitespaces
        feature_names_with_space = ['f ' + str(i) for i in range(X_train.shape[-1])]
        gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names_with_space)
        # the test expects the 'f <i>' names to be reported back as 'f_<i>'
        self.assertListEqual(feature_names, gbm.feature_name())

wxchan's avatar
wxchan committed
608
    def test_save_load_copy_pickle(self):
        # Continues training from a booster restored in several ways (saved file,
        # loaded Booster, copy, deepcopy, pickle) and checks that every variant
        # gives the same hold-out MSE as continuing from the original booster.
        def train_and_predict(init_model=None, return_model=False):
            # Trains 10 rounds (optionally continuing from init_model) and
            # returns either the booster or its hold-out MSE.
            # return_X_y is passed by keyword: recent scikit-learn releases
            # only accept it as a keyword argument.
            X, y = load_boston(return_X_y=True)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
            params = {
                'objective': 'regression',
                'metric': 'l2',
                'verbose': -1
            }
            lgb_train = lgb.Dataset(X_train, y_train)
            gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model)
            return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test))

        gbm = train_and_predict(return_model=True)
        ret_origin = train_and_predict(init_model=gbm)
        other_ret = []
        model_file = 'lgb.model'
        pickle_file = 'lgb.pkl'
        # try/finally so the files written below never outlive the test
        try:
            gbm.save_model(model_file)
            other_ret.append(train_and_predict(init_model=model_file))
            gbm_load = lgb.Booster(model_file=model_file)
            other_ret.append(train_and_predict(init_model=gbm_load))
            other_ret.append(train_and_predict(init_model=copy.copy(gbm)))
            other_ret.append(train_and_predict(init_model=copy.deepcopy(gbm)))
            with open(pickle_file, 'wb') as f:
                pickle.dump(gbm, f)
            with open(pickle_file, 'rb') as f:
                gbm_pickle = pickle.load(f)
            other_ret.append(train_and_predict(init_model=gbm_pickle))
            gbm_pickles = pickle.loads(pickle.dumps(gbm))
            other_ret.append(train_and_predict(init_model=gbm_pickles))
        finally:
            for path in (model_file, pickle_file):
                if os.path.exists(path):
                    os.remove(path)
        for ret in other_ret:
            self.assertAlmostEqual(ret_origin, ret, places=5)
wxchan's avatar
wxchan committed
639

640
    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
    def test_pandas_categorical(self):
        # Train on a pandas DataFrame with category-dtype columns under several
        # categorical_feature settings and compare the resulting predictions
        # and the category lists stored on each Booster.
        import pandas as pd
        np.random.seed(42)  # sometimes there is no difference how cols are treated (cat or not cat)
        X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                          "B": np.random.permutation([1, 2, 3] * 100),  # int
                          "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
                          "D": np.random.permutation([True, False] * 150),  # bool
                          "E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
                                              ordered=True)})  # str and ordered categorical
        y = np.random.permutation([0, 1] * 150)
        # use numpy directly: the `pd.np` alias is deprecated and absent from newer pandas
        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),  # unseen category
                               "B": np.random.permutation([1, 3] * 30),
                               "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
                               "D": np.random.permutation([True, False] * 30),
                               "E": pd.Categorical(np.random.permutation(['z', 'y'] * 30),
                                                   ordered=True)})
        np.random.seed()  # reset seed
        cat_cols_actual = ["A", "B", "C", "D"]
        cat_cols_to_store = cat_cols_actual + ["E"]
        X[cat_cols_actual] = X[cat_cols_actual].astype('category')
        X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
        # categories of every category-dtype column, in column order
        cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbose': -1
        }
        # default categorical_feature ('auto')
        lgb_train = lgb.Dataset(X, y)
        gbm0 = lgb.train(params, lgb_train, num_boost_round=10)
        pred0 = gbm0.predict(X_test)
        self.assertEqual(lgb_train.categorical_feature, 'auto')
        # single categorical column given by index
        lgb_train = lgb.Dataset(X, pd.DataFrame(y))  # also test that label can be one-column pd.DataFrame
        gbm1 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[0])
        pred1 = gbm1.predict(X_test)
        self.assertListEqual(lgb_train.categorical_feature, [0])
        # the same column given by name
        lgb_train = lgb.Dataset(X, pd.Series(y))  # also test that label can be pd.Series
        gbm2 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A'])
        pred2 = gbm2.predict(X_test)
        self.assertListEqual(lgb_train.categorical_feature, ['A'])
        # all unordered category columns given explicitly
        lgb_train = lgb.Dataset(X, y)
        gbm3 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A', 'B', 'C', 'D'])
        pred3 = gbm3.predict(X_test)
        self.assertListEqual(lgb_train.categorical_feature, ['A', 'B', 'C', 'D'])
        # round trips through a model file and a model string
        model_file = 'categorical.model'
        try:
            gbm3.save_model(model_file)
            gbm4 = lgb.Booster(model_file=model_file)
        finally:
            # do not leave the model file behind in the working directory
            if os.path.exists(model_file):
                os.remove(model_file)
        pred4 = gbm4.predict(X_test)
        model_str = gbm4.model_to_string()
        gbm4.model_from_string(model_str, False)
        pred5 = gbm4.predict(X_test)
        gbm5 = lgb.Booster(model_str=model_str)
        pred6 = gbm5.predict(X_test)
        # ordered categorical column included as well
        lgb_train = lgb.Dataset(X, y)
        gbm6 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A', 'B', 'C', 'D', 'E'])
        pred7 = gbm6.predict(X_test)
        self.assertListEqual(lgb_train.categorical_feature, ['A', 'B', 'C', 'D', 'E'])
        # no categorical columns at all
        lgb_train = lgb.Dataset(X, y)
        gbm7 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[])
        pred8 = gbm7.predict(X_test)
        self.assertListEqual(lgb_train.categorical_feature, [])
        self.assertRaises(AssertionError,
                          np.testing.assert_allclose,
                          pred0, pred1)
        self.assertRaises(AssertionError,
                          np.testing.assert_allclose,
                          pred0, pred2)
        np.testing.assert_allclose(pred1, pred2)
        np.testing.assert_allclose(pred0, pred3)
        np.testing.assert_allclose(pred0, pred4)
        np.testing.assert_allclose(pred0, pred5)
        np.testing.assert_allclose(pred0, pred6)
        self.assertRaises(AssertionError,
                          np.testing.assert_allclose,
                          pred0, pred7)  # ordered cat features aren't treated as cat features by default
        self.assertRaises(AssertionError,
                          np.testing.assert_allclose,
                          pred0, pred8)
        # every Booster stores the categories of all five columns
        self.assertListEqual(gbm0.pandas_categorical, cat_values)
        self.assertListEqual(gbm1.pandas_categorical, cat_values)
        self.assertListEqual(gbm2.pandas_categorical, cat_values)
        self.assertListEqual(gbm3.pandas_categorical, cat_values)
        self.assertListEqual(gbm4.pandas_categorical, cat_values)
        self.assertListEqual(gbm5.pandas_categorical, cat_values)
        self.assertListEqual(gbm6.pandas_categorical, cat_values)
        self.assertListEqual(gbm7.pandas_categorical, cat_values)
725

726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
    def test_pandas_sparse(self):
        # Predictions on a DataFrame of sparse columns must match predictions
        # on the densified copy of the same frame.
        import re
        import pandas as pd
        try:
            from pandas.arrays import SparseArray
        except ImportError:  # older pandas exposes SparseArray only at top level
            from pandas import SparseArray
        X = pd.DataFrame({"A": SparseArray(np.random.permutation([0, 1, 2] * 100)),
                          "B": SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
                          "C": SparseArray(np.random.permutation([True, False] * 150))})
        y = pd.Series(SparseArray(np.random.permutation([0, 1] * 150)))
        X_test = pd.DataFrame({"A": SparseArray(np.random.permutation([0, 2] * 30)),
                               "B": SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
                               "C": SparseArray(np.random.permutation([True, False] * 30))})
        # compare the version numerically: a plain string comparison orders
        # e.g. '0.9' after '0.24'
        version_match = re.match(r'(\d+)\.(\d+)', pd.__version__)
        pandas_version = tuple(int(part) for part in version_match.groups()) if version_match else (0, 0)
        if pandas_version >= (0, 24):
            for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
                # isinstance check is what pandas recommends in place of
                # the deprecated pd.api.types.is_sparse
                self.assertTrue(isinstance(dtype, pd.SparseDtype))
        params = {
            'objective': 'binary',
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X, y)
        gbm = lgb.train(params, lgb_train, num_boost_round=10)
        pred_sparse = gbm.predict(X_test, raw_score=True)
        if hasattr(X_test, 'sparse'):
            # accessor-based densification where the frame offers it
            pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
        else:
            pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
        np.testing.assert_allclose(pred_sparse, pred_dense)

752
753
754
    def test_reference_chain(self):
        # Train and validate on Datasets that are subsets (and a subset of a
        # subset) of one parent Dataset; both must be evaluated on every round.
        features = np.random.normal(size=(100, 2))
        target = np.random.normal(size=100)
        parent_set = lgb.Dataset(features, target)

        # take subsets and train
        train_part = parent_set.subset(np.arange(80))
        valid_part = parent_set.subset(np.arange(80, 100)).subset(np.arange(18))

        history = {}
        lgb.train({'objective': 'regression_l2', 'metric': 'rmse'},
                  train_part,
                  num_boost_round=20,
                  valid_sets=[train_part, valid_part],
                  verbose_eval=False,
                  evals_result=history)

        for set_name in ('training', 'valid_1'):
            self.assertEqual(len(history[set_name]['rmse']), 20)
766
767
768
769
770
771
772
773
774
775

    def test_contribs(self):
        # Per-row sum of the pred_contrib output must reproduce the raw score.
        data, target = load_breast_cancer(True)
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1, random_state=42)
        booster = lgb.train(
            {
                'objective': 'binary',
                'metric': 'binary_logloss',
                'verbose': -1,
            },
            lgb.Dataset(X_train, y_train),
            num_boost_round=20,
        )

        raw_scores = booster.predict(X_test, raw_score=True)
        contributions = booster.predict(X_test, pred_contrib=True)
        gap = np.linalg.norm(raw_scores - np.sum(contributions, axis=1))
        self.assertLess(gap, 1e-4)
780
781
782
783
784
785
786
787
788

    def test_sliced_data(self):
        # Training on sliced views (labels, dense matrix, CSR matrix) must give
        # the same predictions as training on the plain arrays.
        def train_and_get_predictions(features, labels):
            # Fit a 10-round binary model and predict on the training features.
            booster = lgb.train(
                params={
                    'application': 'binary',
                    'verbose': -1,
                    'min_data': 5,
                },
                train_set=lgb.Dataset(features, label=labels),
                num_boost_round=10,
            )
            return booster.predict(features)

        num_samples = 100
        features = np.random.rand(num_samples, 5)
        positive_samples = int(num_samples * 0.25)
        positives = np.ones(positive_samples, dtype=np.float32)
        negatives = np.zeros(num_samples - positive_samples, dtype=np.float32)
        labels = np.append(positives, negatives)

        # test sliced labels
        origin_pred = train_and_get_predictions(features, labels)
        label_pair = np.column_stack((labels, np.ones(num_samples, dtype=np.float32)))
        sliced_labels = label_pair[:, 0]
        np.testing.assert_allclose(origin_pred,
                                   train_and_get_predictions(features, sliced_labels))

        # append some columns: two on the left, then two on the right
        padded = features
        for _ in range(2):
            padded = np.column_stack((np.ones(num_samples, dtype=np.float32), padded))
        for _ in range(2):
            padded = np.column_stack((padded, np.ones(num_samples, dtype=np.float32)))
        # append some rows: two on top, then two at the bottom
        for _ in range(2):
            padded = np.concatenate((np.ones(9, dtype=np.float32).reshape((1, 9)), padded), axis=0)
        for _ in range(2):
            padded = np.concatenate((padded, np.ones(9, dtype=np.float32).reshape((1, 9))), axis=0)

        # test sliced 2d matrix
        dense_window = padded[2:102, 2:7]
        self.assertTrue(np.all(dense_window == features))
        np.testing.assert_allclose(origin_pred,
                                   train_and_get_predictions(dense_window, sliced_labels))

        # test sliced CSR
        csr_window = csr_matrix(padded)[2:102, 2:7]
        self.assertTrue(np.all(csr_window == features))
        np.testing.assert_allclose(origin_pred,
                                   train_and_get_predictions(csr_window, sliced_labels))
Guolin Ke's avatar
Guolin Ke committed
828

Guolin Ke's avatar
Guolin Ke committed
829
    def test_init_with_subset(self):
        # Continue training on one subset of a Dataset from a Booster trained on
        # another subset, first with an in-memory parent Dataset and then with
        # a parent loaded from a saved binary file.
        binary_file = "lgb_train_data.bin"

        def remove_binary_file():
            # Delete the saved Dataset file if it is present.
            if os.path.exists(binary_file):
                os.remove(binary_file)

        data = np.random.random((50, 2))
        y = [1] * 25 + [0] * 25
        lgb_train = lgb.Dataset(data, y, free_raw_data=False)
        subset_index_1 = np.random.choice(np.arange(50), 30, replace=False)
        subset_data_1 = lgb_train.subset(subset_index_1)
        subset_index_2 = np.random.choice(np.arange(50), 20, replace=False)
        subset_data_2 = lgb_train.subset(subset_index_2)
        params = {
            'objective': 'binary',
            'verbose': -1
        }
        init_gbm = lgb.train(params=params,
                             train_set=subset_data_1,
                             num_boost_round=10,
                             keep_training_booster=True)
        lgb.train(params=params,
                  train_set=subset_data_2,
                  num_boost_round=10,
                  init_model=init_gbm)
        self.assertEqual(lgb_train.get_data().shape[0], 50)
        self.assertEqual(subset_data_1.get_data().shape[0], 30)
        self.assertEqual(subset_data_2.get_data().shape[0], 20)

        # A file left over from an earlier (possibly aborted) run holds different
        # random data. NOTE(review): LightGBM appears not to overwrite an
        # existing binary file on save - confirm; removing it first is safe either way.
        remove_binary_file()
        try:
            lgb_train.save_binary(binary_file)
            lgb_train_from_file = lgb.Dataset(binary_file, free_raw_data=False)
            subset_data_3 = lgb_train_from_file.subset(subset_index_1)
            subset_data_4 = lgb_train_from_file.subset(subset_index_2)
            init_gbm_2 = lgb.train(params=params,
                                   train_set=subset_data_3,
                                   num_boost_round=10,
                                   keep_training_booster=True)
            # continued training on a subset of the file-backed Dataset is
            # expected to fail with this error
            with np.testing.assert_raises_regex(lgb.basic.LightGBMError, "Unknown format of training data"):
                lgb.train(params=params,
                          train_set=subset_data_4,
                          num_boost_round=10,
                          init_model=init_gbm_2)
            # for a file-backed Dataset and its subsets get_data() is the file path
            self.assertEqual(lgb_train_from_file.get_data(), binary_file)
            self.assertEqual(subset_data_3.get_data(), binary_file)
            self.assertEqual(subset_data_4.get_data(), binary_file)
        finally:
            # do not leave the binary file behind in the working directory
            remove_binary_file()

Guolin Ke's avatar
Guolin Ke committed
869
870
    def test_monotone_constraint(self):
        # A model trained with monotone_constraints '1,-1' must predict values
        # that never decrease along feature 0 and never increase along feature 1.
        def is_increasing(y):
            return np.all(np.diff(y) >= 0.0)

        def is_decreasing(y):
            return np.all(np.diff(y) <= 0.0)

        def is_correctly_constrained(learner):
            # Sweep one feature over [0, 1] while the other is held at each of
            # 200 fixed levels and check the direction of the predictions.
            n = 200
            sweep = np.linspace(0, 1, n).reshape((n, 1))
            for level in np.linspace(0, 1, n):
                held = level * np.ones((n, 1))
                rising_pred = learner.predict(np.column_stack((sweep, held)))
                falling_pred = learner.predict(np.column_stack((held, sweep)))
                if not is_increasing(rising_pred):
                    return False
                if not is_decreasing(falling_pred):
                    return False
            return True

        number_of_dpoints = 2000
        x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints)
        x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
        x = np.column_stack((x1_positively_correlated_with_y, x2_negatively_correlated_with_y))
        noise = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints)
        # target rises with x1 and falls with x2, with a periodic wiggle and noise
        y = 5 * x1_positively_correlated_with_y + np.sin(10 * np.pi * x1_positively_correlated_with_y)
        y = y - 5 * x2_negatively_correlated_with_y
        y = y - np.cos(10 * np.pi * x2_negatively_correlated_with_y)
        y = y + noise

        constrained_model = lgb.train(
            {
                'min_data': 20,
                'num_leaves': 20,
                'monotone_constraints': '1,-1'
            },
            lgb.Dataset(x, label=y),
        )
        self.assertTrue(is_correctly_constrained(constrained_model))
Guolin Ke's avatar
Guolin Ke committed
908

Belinda Trotta's avatar
Belinda Trotta committed
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
    def test_max_bin_by_feature(self):
        # The number of distinct predictions of a single tree must follow the
        # per-feature bin limits given in max_bin_by_feature.
        ramp = np.arange(0, 100)[:, np.newaxis]
        step = np.zeros((100, 1))
        step[20:] = 1
        X = np.concatenate([ramp, step], axis=1)
        y = np.arange(0, 100)
        params = {
            'objective': 'regression_l2',
            'verbose': -1,
            'num_leaves': 100,
            'min_data_in_leaf': 1,
            'min_sum_hessian_in_leaf': 0,
            'min_data_in_bin': 1,
            'max_bin_by_feature': [100, 2]
        }
        # (bin limits per feature, expected number of distinct predictions)
        for bin_limits, expected_distinct in (([100, 2], 100), ([2, 100], 3)):
            params['max_bin_by_feature'] = bin_limits
            est = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=1)
            self.assertEqual(len(np.unique(est.predict(X))), expected_distinct)

932
933
934
935
936
937
938
939
940
941
942
943
944
    def test_small_max_bin(self):
        # Training must run through with a very small max_bin, with and
        # without a missing value in the single feature.
        np.random.seed(0)
        y = np.random.choice([0, 1], 100)
        x = np.zeros((100, 1))
        for start, stop, value in ((0, 30, -1), (30, 60, 1), (60, 100, 2)):
            x[start:stop, 0] = value
        params = {'objective': 'binary',
                  'seed': 0,
                  'min_data_in_leaf': 1,
                  'verbose': -1,
                  'max_bin': 2}
        lgb.train(params, lgb.Dataset(x, label=y), num_boost_round=5)

        # repeat with one NaN and one more bin
        x[0, 0] = np.nan
        params['max_bin'] = 3
        lgb.train(params, lgb.Dataset(x, label=y), num_boost_round=5)
        np.random.seed()  # reset seed

Guolin Ke's avatar
Guolin Ke committed
952
953
954
955
956
957
958
959
960
961
    def test_refit(self):
        # Refitting a trained model on the test data must lower its log loss
        # on that same data.
        data, target = load_breast_cancer(True)
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1, random_state=42)
        booster = lgb.train(
            {
                'objective': 'binary',
                'metric': 'binary_logloss',
                'verbose': -1,
                'min_data': 10
            },
            lgb.Dataset(X_train, y_train),
            num_boost_round=20,
        )
        loss_before = log_loss(y_test, booster.predict(X_test))
        refitted = booster.refit(X_test, y_test)
        loss_after = log_loss(y_test, refitted.predict(X_test))
        self.assertGreater(loss_before, loss_after)
967
968
969
970
971
972
973
974
975
976

    def test_mape_rf(self):
        # Random forest with the MAPE objective: the mean prediction on the
        # training data must exceed 20.
        data, target = load_boston(True)
        rf_params = {
            'boosting_type': 'rf',
            'objective': 'mape',
            'verbose': -1,
            'bagging_freq': 1,
            'bagging_fraction': 0.8,
            'feature_fraction': 0.8,
            'boost_from_average': True
        }
        booster = lgb.train(rf_params, lgb.Dataset(data, target), num_boost_round=20)
        self.assertGreater(booster.predict(data).mean(), 20)

    def test_mape_dart(self):
        # DART with the MAPE objective and no boost_from_average: the mean
        # prediction on the training data must exceed 18.
        data, target = load_boston(True)
        dart_params = {
            'boosting_type': 'dart',
            'objective': 'mape',
            'verbose': -1,
            'bagging_freq': 1,
            'bagging_fraction': 0.8,
            'feature_fraction': 0.8,
            'boost_from_average': False
        }
        booster = lgb.train(dart_params, lgb.Dataset(data, target), num_boost_round=40)
        self.assertGreater(booster.predict(data).mean(), 18)
1001

1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
    def check_constant_features(self, y_true, expected_pred, more_params):
        # Helper: train 2 rounds on a single all-ones feature column with labels
        # y_true and assert that the predictions are close to expected_pred.
        # more_params is applied on top of the defaults below.
        labels = np.array(y_true)
        constant_column = np.ones((len(y_true), 1))
        settings = {
            'objective': 'regression',
            'num_class': 1,
            'verbose': -1,
            'min_data': 1,
            'num_leaves': 2,
            'learning_rate': 1,
            'min_data_in_bin': 1,
            'boost_from_average': True
        }
        settings.update(more_params)
        train_set = lgb.Dataset(constant_column, labels, params=settings)
        booster = lgb.train(settings, train_set, num_boost_round=2)
        predictions = booster.predict(constant_column)
        self.assertTrue(np.allclose(predictions, expected_pred))
1020
1021
1022
1023
1024

    def test_constant_features_regression(self):
        # (labels, expected prediction) pairs for the regression objective
        params = {
            'objective': 'regression'
        }
        cases = (
            ([0.0, 10.0, 0.0, 10.0], 5.0),
            ([0.0, 1.0, 2.0, 3.0], 1.5),
            ([-1.0, 1.0, -2.0, 2.0], 0.0),
        )
        for labels, expected in cases:
            self.check_constant_features(labels, expected, params)
1028
1029
1030
1031
1032

    def test_constant_features_binary(self):
        # (labels, expected prediction) pairs for the binary objective
        params = {
            'objective': 'binary'
        }
        cases = (
            ([0.0, 10.0, 0.0, 10.0], 0.5),
            ([0.0, 1.0, 2.0, 3.0], 0.75),
        )
        for labels, expected in cases:
            self.check_constant_features(labels, expected, params)
1035
1036
1037
1038
1039
1040

    def test_constant_features_multiclass(self):
        # (labels, expected per-class prediction) pairs for the multiclass objective
        params = {
            'objective': 'multiclass',
            'num_class': 3
        }
        cases = (
            ([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25]),
            ([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25]),
        )
        for labels, expected in cases:
            self.check_constant_features(labels, expected, params)
1043
1044
1045
1046
1047
1048

    def test_constant_features_multiclassova(self):
        # (labels, expected per-class prediction) pairs for the multiclassova objective
        params = {
            'objective': 'multiclassova',
            'num_class': 3
        }
        cases = (
            ([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25]),
            ([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25]),
        )
        for labels, expected in cases:
            self.check_constant_features(labels, expected, params)
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070

    def test_fpreproc(self):
        # lgb.cv must apply the fpreproc callback: the callback introduces a
        # fourth class, so cv only works if its data and params are used.
        def preprocess_data(dtrain, dtest, params):
            # Shift the first feature by one, relabel the last five rows of each
            # set as class 3 and rebuild both Datasets with num_class raised to 4.
            raw_train = dtrain.construct().get_data()
            raw_test = dtest.construct().get_data()
            raw_train[:, 0] += 1
            raw_test[:, 0] += 1
            dtrain.label[-5:] = 3
            dtest.label[-5:] = 3
            new_train = lgb.Dataset(raw_train, dtrain.label)
            new_test = lgb.Dataset(raw_test, dtest.label, reference=new_train)
            params['num_class'] = 4
            return new_train, new_test, params

        data, target = load_iris(True)
        full_set = lgb.Dataset(data, target, free_raw_data=False)
        cv_params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
        cv_results = lgb.cv(cv_params, full_set, num_boost_round=10, fpreproc=preprocess_data)
        self.assertIn('multi_logloss-mean', cv_results)
        self.assertEqual(len(cv_results['multi_logloss-mean']), 10)
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094

    def test_metrics(self):
        X, y = load_digits(2, True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        lgb_train = lgb.Dataset(X_train, y_train, silent=True)
        lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train, silent=True)

        evals_result = {}
        params_verbose = {'verbose': -1}
        params_obj_verbose = {'objective': 'binary', 'verbose': -1}
        params_obj_metric_log_verbose = {'objective': 'binary', 'metric': 'binary_logloss', 'verbose': -1}
        params_obj_metric_err_verbose = {'objective': 'binary', 'metric': 'binary_error', 'verbose': -1}
        params_obj_metric_inv_verbose = {'objective': 'binary', 'metric': 'invalid_metric', 'verbose': -1}
        params_obj_metric_multi_verbose = {'objective': 'binary',
                                           'metric': ['binary_logloss', 'binary_error'],
                                           'verbose': -1}
        params_obj_metric_none_verbose = {'objective': 'binary', 'metric': 'None', 'verbose': -1}
        params_metric_log_verbose = {'metric': 'binary_logloss', 'verbose': -1}
        params_metric_err_verbose = {'metric': 'binary_error', 'verbose': -1}
        params_metric_inv_verbose = {'metric_types': 'invalid_metric', 'verbose': -1}
        params_metric_multi_verbose = {'metric': ['binary_logloss', 'binary_error'], 'verbose': -1}
        params_metric_none_verbose = {'metric': 'None', 'verbose': -1}

        def get_cv_result(params=params_obj_verbose, **kwargs):
1095
            return lgb.cv(params, lgb_train, num_boost_round=2, verbose_eval=False, **kwargs)
1096
1097
1098

        def train_booster(params=params_obj_verbose, **kwargs):
            lgb.train(params, lgb_train,
1099
                      num_boost_round=2,
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
                      valid_sets=[lgb_valid],
                      evals_result=evals_result,
                      verbose_eval=False, **kwargs)

        # no fobj, no feval
        # default metric
        res = get_cv_result()
        self.assertEqual(len(res), 2)
        self.assertIn('binary_logloss-mean', res)

        # non-default metric in params
        res = get_cv_result(params=params_obj_metric_err_verbose)
        self.assertEqual(len(res), 2)
        self.assertIn('binary_error-mean', res)

        # default metric in args
        res = get_cv_result(metrics='binary_logloss')
        self.assertEqual(len(res), 2)
        self.assertIn('binary_logloss-mean', res)

        # non-default metric in args
        res = get_cv_result(metrics='binary_error')
        self.assertEqual(len(res), 2)
        self.assertIn('binary_error-mean', res)

        # metric in args overwrites one in params
        res = get_cv_result(params=params_obj_metric_inv_verbose, metrics='binary_error')
        self.assertEqual(len(res), 2)
        self.assertIn('binary_error-mean', res)

        # multiple metrics in params
        res = get_cv_result(params=params_obj_metric_multi_verbose)
        self.assertEqual(len(res), 4)
        self.assertIn('binary_logloss-mean', res)
        self.assertIn('binary_error-mean', res)

        # multiple metrics in args
        res = get_cv_result(metrics=['binary_logloss', 'binary_error'])
        self.assertEqual(len(res), 4)
        self.assertIn('binary_logloss-mean', res)
        self.assertIn('binary_error-mean', res)

        # remove default metric by 'None' in list
        res = get_cv_result(metrics=['None'])
        self.assertEqual(len(res), 0)

        # remove default metric by 'None' aliases
        for na_alias in ('None', 'na', 'null', 'custom'):
            res = get_cv_result(metrics=na_alias)
            self.assertEqual(len(res), 0)

        # fobj, no feval
        # no default metric
1153
        res = get_cv_result(params=params_verbose, fobj=dummy_obj)
1154
1155
1156
        self.assertEqual(len(res), 0)

        # metric in params
1157
        res = get_cv_result(params=params_metric_err_verbose, fobj=dummy_obj)
1158
1159
1160
1161
        self.assertEqual(len(res), 2)
        self.assertIn('binary_error-mean', res)

        # metric in args
1162
        res = get_cv_result(params=params_verbose, fobj=dummy_obj, metrics='binary_error')
1163
1164
1165
1166
        self.assertEqual(len(res), 2)
        self.assertIn('binary_error-mean', res)

        # metric in args overwrites its' alias in params
1167
        res = get_cv_result(params=params_metric_inv_verbose, fobj=dummy_obj, metrics='binary_error')
1168
1169
1170
1171
        self.assertEqual(len(res), 2)
        self.assertIn('binary_error-mean', res)

        # multiple metrics in params
1172
        res = get_cv_result(params=params_metric_multi_verbose, fobj=dummy_obj)
1173
1174
1175
1176
1177
        self.assertEqual(len(res), 4)
        self.assertIn('binary_logloss-mean', res)
        self.assertIn('binary_error-mean', res)

        # multiple metrics in args
1178
        res = get_cv_result(params=params_verbose, fobj=dummy_obj,
1179
1180
1181
1182
1183
1184
1185
                            metrics=['binary_logloss', 'binary_error'])
        self.assertEqual(len(res), 4)
        self.assertIn('binary_logloss-mean', res)
        self.assertIn('binary_error-mean', res)

        # no fobj, feval
        # default metric with custom one
1186
        res = get_cv_result(feval=constant_metric)
1187
1188
1189
1190
1191
        self.assertEqual(len(res), 4)
        self.assertIn('binary_logloss-mean', res)
        self.assertIn('error-mean', res)

        # non-default metric in params with custom one
1192
        res = get_cv_result(params=params_obj_metric_err_verbose, feval=constant_metric)
1193
1194
1195
1196
1197
        self.assertEqual(len(res), 4)
        self.assertIn('binary_error-mean', res)
        self.assertIn('error-mean', res)

        # default metric in args with custom one
1198
        res = get_cv_result(metrics='binary_logloss', feval=constant_metric)
1199
1200
1201
1202
1203
        self.assertEqual(len(res), 4)
        self.assertIn('binary_logloss-mean', res)
        self.assertIn('error-mean', res)

        # non-default metric in args with custom one
1204
        res = get_cv_result(metrics='binary_error', feval=constant_metric)
1205
1206
1207
1208
1209
        self.assertEqual(len(res), 4)
        self.assertIn('binary_error-mean', res)
        self.assertIn('error-mean', res)

        # metric in args overwrites one in params, custom one is evaluated too
1210
        res = get_cv_result(params=params_obj_metric_inv_verbose, metrics='binary_error', feval=constant_metric)
1211
1212
1213
1214
1215
        self.assertEqual(len(res), 4)
        self.assertIn('binary_error-mean', res)
        self.assertIn('error-mean', res)

        # multiple metrics in params with custom one
1216
        res = get_cv_result(params=params_obj_metric_multi_verbose, feval=constant_metric)
1217
1218
1219
1220
1221
1222
        self.assertEqual(len(res), 6)
        self.assertIn('binary_logloss-mean', res)
        self.assertIn('binary_error-mean', res)
        self.assertIn('error-mean', res)

        # multiple metrics in args with custom one
1223
        res = get_cv_result(metrics=['binary_logloss', 'binary_error'], feval=constant_metric)
1224
1225
1226
1227
1228
1229
        self.assertEqual(len(res), 6)
        self.assertIn('binary_logloss-mean', res)
        self.assertIn('binary_error-mean', res)
        self.assertIn('error-mean', res)

        # custom metric is evaluated despite 'None' is passed
1230
        res = get_cv_result(metrics=['None'], feval=constant_metric)
1231
1232
1233
1234
1235
        self.assertEqual(len(res), 2)
        self.assertIn('error-mean', res)

        # fobj, feval
        # no default metric, only custom one
1236
        res = get_cv_result(params=params_verbose, fobj=dummy_obj, feval=constant_metric)
1237
1238
1239
1240
        self.assertEqual(len(res), 2)
        self.assertIn('error-mean', res)

        # metric in params with custom one
1241
        res = get_cv_result(params=params_metric_err_verbose, fobj=dummy_obj, feval=constant_metric)
1242
1243
1244
1245
1246
        self.assertEqual(len(res), 4)
        self.assertIn('binary_error-mean', res)
        self.assertIn('error-mean', res)

        # metric in args with custom one
1247
1248
        res = get_cv_result(params=params_verbose, fobj=dummy_obj,
                            feval=constant_metric, metrics='binary_error')
1249
1250
1251
1252
1253
        self.assertEqual(len(res), 4)
        self.assertIn('binary_error-mean', res)
        self.assertIn('error-mean', res)

        # metric in args overwrites one in params, custom one is evaluated too
1254
1255
        res = get_cv_result(params=params_metric_inv_verbose, fobj=dummy_obj,
                            feval=constant_metric, metrics='binary_error')
1256
1257
1258
1259
1260
        self.assertEqual(len(res), 4)
        self.assertIn('binary_error-mean', res)
        self.assertIn('error-mean', res)

        # multiple metrics in params with custom one
1261
        res = get_cv_result(params=params_metric_multi_verbose, fobj=dummy_obj, feval=constant_metric)
1262
1263
1264
1265
1266
1267
        self.assertEqual(len(res), 6)
        self.assertIn('binary_logloss-mean', res)
        self.assertIn('binary_error-mean', res)
        self.assertIn('error-mean', res)

        # multiple metrics in args with custom one
1268
        res = get_cv_result(params=params_verbose, fobj=dummy_obj, feval=constant_metric,
1269
1270
1271
1272
1273
1274
1275
                            metrics=['binary_logloss', 'binary_error'])
        self.assertEqual(len(res), 6)
        self.assertIn('binary_logloss-mean', res)
        self.assertIn('binary_error-mean', res)
        self.assertIn('error-mean', res)

        # custom metric is evaluated despite 'None' is passed
1276
        res = get_cv_result(params=params_metric_none_verbose, fobj=dummy_obj, feval=constant_metric)
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
        self.assertEqual(len(res), 2)
        self.assertIn('error-mean', res)

        # no fobj, no feval
        # default metric
        train_booster()
        self.assertEqual(len(evals_result['valid_0']), 1)
        self.assertIn('binary_logloss', evals_result['valid_0'])

        # default metric in params
        train_booster(params=params_obj_metric_log_verbose)
        self.assertEqual(len(evals_result['valid_0']), 1)
        self.assertIn('binary_logloss', evals_result['valid_0'])

        # non-default metric in params
        train_booster(params=params_obj_metric_err_verbose)
        self.assertEqual(len(evals_result['valid_0']), 1)
        self.assertIn('binary_error', evals_result['valid_0'])

        # multiple metrics in params
        train_booster(params=params_obj_metric_multi_verbose)
        self.assertEqual(len(evals_result['valid_0']), 2)
        self.assertIn('binary_logloss', evals_result['valid_0'])
        self.assertIn('binary_error', evals_result['valid_0'])

        # remove default metric by 'None' aliases
        for na_alias in ('None', 'na', 'null', 'custom'):
            params = {'objective': 'binary', 'metric': na_alias, 'verbose': -1}
            train_booster(params=params)
            self.assertEqual(len(evals_result), 0)

        # fobj, no feval
        # no default metric
1310
        train_booster(params=params_verbose, fobj=dummy_obj)
1311
1312
1313
        self.assertEqual(len(evals_result), 0)

        # metric in params
1314
        train_booster(params=params_metric_log_verbose, fobj=dummy_obj)
1315
1316
1317
1318
        self.assertEqual(len(evals_result['valid_0']), 1)
        self.assertIn('binary_logloss', evals_result['valid_0'])

        # multiple metrics in params
1319
        train_booster(params=params_metric_multi_verbose, fobj=dummy_obj)
1320
1321
1322
1323
1324
1325
        self.assertEqual(len(evals_result['valid_0']), 2)
        self.assertIn('binary_logloss', evals_result['valid_0'])
        self.assertIn('binary_error', evals_result['valid_0'])

        # no fobj, feval
        # default metric with custom one
1326
        train_booster(feval=constant_metric)
1327
1328
1329
1330
1331
        self.assertEqual(len(evals_result['valid_0']), 2)
        self.assertIn('binary_logloss', evals_result['valid_0'])
        self.assertIn('error', evals_result['valid_0'])

        # default metric in params with custom one
1332
        train_booster(params=params_obj_metric_log_verbose, feval=constant_metric)
1333
1334
1335
1336
1337
        self.assertEqual(len(evals_result['valid_0']), 2)
        self.assertIn('binary_logloss', evals_result['valid_0'])
        self.assertIn('error', evals_result['valid_0'])

        # non-default metric in params with custom one
1338
        train_booster(params=params_obj_metric_err_verbose, feval=constant_metric)
1339
1340
1341
1342
1343
        self.assertEqual(len(evals_result['valid_0']), 2)
        self.assertIn('binary_error', evals_result['valid_0'])
        self.assertIn('error', evals_result['valid_0'])

        # multiple metrics in params with custom one
1344
        train_booster(params=params_obj_metric_multi_verbose, feval=constant_metric)
1345
1346
1347
1348
1349
1350
        self.assertEqual(len(evals_result['valid_0']), 3)
        self.assertIn('binary_logloss', evals_result['valid_0'])
        self.assertIn('binary_error', evals_result['valid_0'])
        self.assertIn('error', evals_result['valid_0'])

        # custom metric is evaluated despite 'None' is passed
1351
        train_booster(params=params_obj_metric_none_verbose, feval=constant_metric)
1352
1353
1354
1355
1356
        self.assertEqual(len(evals_result), 1)
        self.assertIn('error', evals_result['valid_0'])

        # fobj, feval
        # no default metric, only custom one
1357
        train_booster(params=params_verbose, fobj=dummy_obj, feval=constant_metric)
1358
1359
1360
1361
        self.assertEqual(len(evals_result['valid_0']), 1)
        self.assertIn('error', evals_result['valid_0'])

        # metric in params with custom one
1362
        train_booster(params=params_metric_log_verbose, fobj=dummy_obj, feval=constant_metric)
1363
1364
1365
1366
1367
        self.assertEqual(len(evals_result['valid_0']), 2)
        self.assertIn('binary_logloss', evals_result['valid_0'])
        self.assertIn('error', evals_result['valid_0'])

        # multiple metrics in params with custom one
1368
        train_booster(params=params_metric_multi_verbose, fobj=dummy_obj, feval=constant_metric)
1369
1370
1371
1372
1373
1374
        self.assertEqual(len(evals_result['valid_0']), 3)
        self.assertIn('binary_logloss', evals_result['valid_0'])
        self.assertIn('binary_error', evals_result['valid_0'])
        self.assertIn('error', evals_result['valid_0'])

        # custom metric is evaluated despite 'None' is passed
1375
        train_booster(params=params_metric_none_verbose, fobj=dummy_obj, feval=constant_metric)
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
        self.assertEqual(len(evals_result), 1)
        self.assertIn('error', evals_result['valid_0'])

        X, y = load_digits(3, True)
        lgb_train = lgb.Dataset(X, y, silent=True)

        obj_multi_aliases = ['multiclass', 'softmax', 'multiclassova', 'multiclass_ova', 'ova', 'ovr']
        for obj_multi_alias in obj_multi_aliases:
            params_obj_class_3_verbose = {'objective': obj_multi_alias, 'num_class': 3, 'verbose': -1}
            params_obj_class_1_verbose = {'objective': obj_multi_alias, 'num_class': 1, 'verbose': -1}
            params_obj_verbose = {'objective': obj_multi_alias, 'verbose': -1}
            # multiclass default metric
            res = get_cv_result(params_obj_class_3_verbose)
            self.assertEqual(len(res), 2)
            self.assertIn('multi_logloss-mean', res)
            # multiclass default metric with custom one
1392
            res = get_cv_result(params_obj_class_3_verbose, feval=constant_metric)
1393
1394
1395
1396
            self.assertEqual(len(res), 4)
            self.assertIn('multi_logloss-mean', res)
            self.assertIn('error-mean', res)
            # multiclass metric alias with custom one for custom objective
1397
            res = get_cv_result(params_obj_class_3_verbose, fobj=dummy_obj, feval=constant_metric)
1398
1399
1400
            self.assertEqual(len(res), 2)
            self.assertIn('error-mean', res)
            # no metric for invalid class_num
1401
            res = get_cv_result(params_obj_class_1_verbose, fobj=dummy_obj)
1402
1403
            self.assertEqual(len(res), 0)
            # custom metric for invalid class_num
1404
            res = get_cv_result(params_obj_class_1_verbose, fobj=dummy_obj, feval=constant_metric)
1405
1406
1407
1408
1409
            self.assertEqual(len(res), 2)
            self.assertIn('error-mean', res)
            # multiclass metric alias with custom one with invalid class_num
            self.assertRaises(lgb.basic.LightGBMError, get_cv_result,
                              params_obj_class_1_verbose, metrics=obj_multi_alias,
1410
                              fobj=dummy_obj, feval=constant_metric)
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
            # multiclass default metric without num_class
            self.assertRaises(lgb.basic.LightGBMError, get_cv_result,
                              params_obj_verbose)
            for metric_multi_alias in obj_multi_aliases + ['multi_logloss']:
                # multiclass metric alias
                res = get_cv_result(params_obj_class_3_verbose, metrics=metric_multi_alias)
                self.assertEqual(len(res), 2)
                self.assertIn('multi_logloss-mean', res)
            # multiclass metric
            res = get_cv_result(params_obj_class_3_verbose, metrics='multi_error')
            self.assertEqual(len(res), 2)
            self.assertIn('multi_error-mean', res)
            # non-valid metric for multiclass objective
            self.assertRaises(lgb.basic.LightGBMError, get_cv_result,
                              params_obj_class_3_verbose, metrics='binary_logloss')
        params_class_3_verbose = {'num_class': 3, 'verbose': -1}
        # non-default num_class for default objective
        self.assertRaises(lgb.basic.LightGBMError, get_cv_result,
                          params_class_3_verbose)
        # no metric with non-default num_class for custom objective
1431
        res = get_cv_result(params_class_3_verbose, fobj=dummy_obj)
1432
1433
1434
        self.assertEqual(len(res), 0)
        for metric_multi_alias in obj_multi_aliases + ['multi_logloss']:
            # multiclass metric alias for custom objective
1435
            res = get_cv_result(params_class_3_verbose, metrics=metric_multi_alias, fobj=dummy_obj)
1436
1437
1438
            self.assertEqual(len(res), 2)
            self.assertIn('multi_logloss-mean', res)
        # multiclass metric for custom objective
1439
        res = get_cv_result(params_class_3_verbose, metrics='multi_error', fobj=dummy_obj)
1440
1441
1442
1443
        self.assertEqual(len(res), 2)
        self.assertIn('multi_error-mean', res)
        # binary metric with non-default num_class for custom objective
        self.assertRaises(lgb.basic.LightGBMError, get_cv_result,
1444
                          params_class_3_verbose, metrics='binary_error', fobj=dummy_obj)
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471

    @unittest.skipIf(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, 'not enough RAM')
    def test_model_size(self):
        # Build a model string longer than 2**31 characters by repeating one tree and
        # padding with spaces, then check that the booster still loads and predicts with it.
        features, target = load_boston(True)
        train_data = lgb.Dataset(features, target)
        booster = lgb.train({'verbose': -1}, train_data, num_boost_round=2)
        expected_pred = booster.predict(features)

        model_text = booster.model_to_string()
        trees_end = model_text.find('end of trees')
        second_tree = model_text[model_text.find('Tree=1'):trees_end]
        second_tree_len = len(second_tree)
        # turn the tree header into a placeholder so that every copy gets its own index
        tree_template = second_tree.replace('Tree=1', 'Tree={}')
        n_copies = 100
        n_trees = n_copies + 2
        try:
            header = model_text[:model_text.find('tree_sizes')]
            original_trees = model_text[model_text.find('Tree=0'):trees_end]
            copied_trees = (tree_template * n_copies).format(*range(2, n_trees))
            footer = model_text[trees_end:]
            # the padding is intentionally not bound to a name: it is huge and
            # should be released as soon as the concatenation is done
            huge_model = (header + '\n\n' + original_trees + copied_trees + footer
                          + ' ' * (2**31 - second_tree_len * n_trees))
            self.assertGreater(len(huge_model), 2**31)
            booster.model_from_string(huge_model, verbose=False)
            self.assertEqual(booster.num_trees(), n_trees)
            # only the first two (original) iterations are used for the comparison
            np.testing.assert_allclose(expected_pred, booster.predict(features, num_iteration=2))
        except MemoryError:
            self.skipTest('not enough RAM')
1472
1473
1474

    def test_get_split_value_histogram(self):
        # Checks the output of Booster.get_split_value_histogram in XGBoost-style and
        # numpy-style modes: expected shapes, index/name equivalence, string `bins`
        # and the error raised for a categorical feature.
        X, y = load_boston(True)
        lgb_train = lgb.Dataset(X, y, categorical_feature=[2])
        gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=20)
        last_feature = X.shape[-1] - 1

        def as_array(hist):
            # the XGBoost-style result is unwrapped through `.values` when pandas
            # is installed and compared as is otherwise
            return hist.values if lgb.compat.PANDAS_INSTALLED else hist

        # test XGBoost-style return value
        params = {'feature': 0, 'xgboost_style': True}
        self.assertTupleEqual(gbm.get_split_value_histogram(**params).shape, (9, 2))
        for n_bins, expected_shape in ((999, (9, 2)), (-1, (1, 2)), (0, (1, 2)), (1, (1, 2)),
                                       (2, (2, 2)), (6, (5, 2)), (7, (6, 2))):
            self.assertTupleEqual(gbm.get_split_value_histogram(bins=n_bins, **params).shape,
                                  expected_shape)
        # a feature can be addressed either by its index or by its name
        for feature_idx in (0, last_feature):
            np.testing.assert_allclose(
                as_array(gbm.get_split_value_histogram(feature_idx, xgboost_style=True)),
                as_array(gbm.get_split_value_histogram(gbm.feature_name()[feature_idx], xgboost_style=True))
            )

        # test numpy-style return value
        hist, bins = gbm.get_split_value_histogram(0)
        self.assertEqual(len(hist), 23)
        self.assertEqual(len(bins), 24)
        hist, bins = gbm.get_split_value_histogram(0, bins=999)
        self.assertEqual(len(hist), 999)
        self.assertEqual(len(bins), 1000)
        self.assertRaises(ValueError, gbm.get_split_value_histogram, 0, bins=-1)
        self.assertRaises(ValueError, gbm.get_split_value_histogram, 0, bins=0)
        for n_bins in (1, 2, 6, 7):
            hist, bins = gbm.get_split_value_histogram(0, bins=n_bins)
            self.assertEqual(len(hist), n_bins)
            self.assertEqual(len(bins), n_bins + 1)
        for feature_idx in (0, last_feature):
            hist_idx, bins_idx = gbm.get_split_value_histogram(feature_idx)
            hist_name, bins_name = gbm.get_split_value_histogram(gbm.feature_name()[feature_idx])
            np.testing.assert_array_equal(hist_idx, hist_name)
            np.testing.assert_allclose(bins_idx, bins_name)

        # test bins string type
        # Versions must be compared numerically: as strings '1.9.0' > '1.11.0' is True
        # although 1.9 is the older release. Only the leading digits of the first three
        # components are used, so suffixes such as 'rc1' or 'dev0' are ignored.
        numpy_version = tuple(int(''.join(itertools.takewhile(lambda ch: ch.isdigit(), part)) or 0)
                              for part in np.__version__.split('.')[:3])
        if numpy_version > (1, 11, 0):
            hist_vals, bin_edges = gbm.get_split_value_histogram(0, bins='auto')
            hist = gbm.get_split_value_histogram(0, bins='auto', xgboost_style=True)
            # the XGBoost-style output is compared only against non-empty bins
            mask = hist_vals > 0
            if lgb.compat.PANDAS_INSTALLED:
                counts, split_values = hist['Count'].values, hist['SplitValue'].values
            else:
                counts, split_values = hist[:, 1], hist[:, 0]
            np.testing.assert_array_equal(hist_vals[mask], counts)
            np.testing.assert_allclose(bin_edges[1:][mask], split_values)

        # test histogram is disabled for categorical features
        self.assertRaises(lgb.basic.LightGBMError, gbm.get_split_value_histogram, 2)
1548
1549
1550

    def test_early_stopping_for_only_first_metric(self):
        # Verifies at which iteration early stopping triggers for different metric
        # orders, with and without `first_metric_only`, for both lgb.train and lgb.cv.
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=73)
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid1 = lgb.Dataset(X_test1, y_test1, reference=lgb_train)
        lgb_valid2 = lgb.Dataset(X_test2, y_test2, reference=lgb_train)

        def check_train(valid_sets, metric_list, expected_iteration, first_metric_only, feval=None):
            # train with early stopping and compare the best iteration with the expected one
            train_params = {
                'objective': 'regression',
                'learning_rate': 1.1,
                'num_leaves': 10,
                'metric': metric_list,
                'verbose': -1,
                'seed': 123,
                'first_metric_only': first_metric_only
            }
            booster = lgb.train(train_params, lgb_train,
                                num_boost_round=25, valid_sets=valid_sets, feval=feval,
                                early_stopping_rounds=5, verbose_eval=False)
            self.assertEqual(expected_iteration, booster.best_iteration)

        def check_cv(metric_list, expected_iteration, first_metric_only, eval_train_metric, feval=None):
            # run CV with early stopping and compare the number of recorded rounds
            cv_params = {
                'objective': 'regression',
                'learning_rate': 0.9,
                'num_leaves': 10,
                'metric': metric_list,
                'verbose': -1,
                'seed': 123,
                'gpu_use_dp': True,
                'first_metric_only': first_metric_only
            }
            cv_result = lgb.cv(cv_params,
                               train_set=lgb_train, num_boost_round=25,
                               stratified=False, feval=feval,
                               early_stopping_rounds=5, verbose_eval=False,
                               eval_train_metric=eval_train_metric)
            # every entry has one value per round, so the first one is enough
            self.assertEqual(expected_iteration, len(next(iter(cv_result.values()))))

        def decreasing_then_constant(preds, train_data):
            return [decreasing_metric(preds, train_data),
                    constant_metric(preds, train_data)]

        def constant_then_decreasing(preds, train_data):
            return [constant_metric(preds, train_data),
                    decreasing_metric(preds, train_data)]

        # expected stopping iterations, hard-coded for this data split and these parameters
        iter_valid1_l1 = 3
        iter_valid1_l2 = 14
        iter_valid2_l1 = 2
        iter_valid2_l2 = 15
        # the checks below are only meaningful if all of them are different
        self.assertEqual(len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}), 4)
        iter_min_l1 = min(iter_valid1_l1, iter_valid2_l1)
        iter_min_l2 = min(iter_valid1_l2, iter_valid2_l2)
        iter_min = min(iter_min_l1, iter_min_l2)
        iter_min_valid1 = min(iter_valid1_l1, iter_valid1_l2)

        iter_cv_l1 = 4
        iter_cv_l2 = 12
        self.assertEqual(len({iter_cv_l1, iter_cv_l2}), 2)
        iter_cv_min = min(iter_cv_l1, iter_cv_l2)

        # test for lgb.train
        check_train(lgb_valid1, [], iter_valid1_l2, False)
        check_train(lgb_valid1, [], iter_valid1_l2, True)
        check_train(lgb_valid1, None, iter_valid1_l2, False)
        check_train(lgb_valid1, None, iter_valid1_l2, True)
        check_train(lgb_valid1, 'l2', iter_valid1_l2, True)
        check_train(lgb_valid1, 'l1', iter_valid1_l1, True)
        check_train(lgb_valid1, ['l2', 'l1'], iter_valid1_l2, True)
        check_train(lgb_valid1, ['l1', 'l2'], iter_valid1_l1, True)
        check_train(lgb_valid1, ['l2', 'l1'], iter_min_valid1, False)
        check_train(lgb_valid1, ['l1', 'l2'], iter_min_valid1, False)

        # test feval for lgb.train
        check_train(lgb_valid1, 'None', 1, False, feval=decreasing_then_constant)
        check_train(lgb_valid1, 'None', 25, True, feval=decreasing_then_constant)
        check_train(lgb_valid1, 'None', 1, True, feval=constant_then_decreasing)

        # test with two valid data for lgb.train
        check_train([lgb_valid1, lgb_valid2], ['l2', 'l1'], iter_min_l2, True)
        check_train([lgb_valid2, lgb_valid1], ['l2', 'l1'], iter_min_l2, True)
        check_train([lgb_valid1, lgb_valid2], ['l1', 'l2'], iter_min_l1, True)
        check_train([lgb_valid2, lgb_valid1], ['l1', 'l2'], iter_min_l1, True)

        # test for lgb.cv, first without and then with metrics evaluated on train data
        for eval_train_metric in (False, True):
            check_cv(None, iter_cv_l2, True, eval_train_metric)
            check_cv('l2', iter_cv_l2, True, eval_train_metric)
            check_cv('l1', iter_cv_l1, True, eval_train_metric)
            check_cv(['l2', 'l1'], iter_cv_l2, True, eval_train_metric)
            check_cv(['l1', 'l2'], iter_cv_l1, True, eval_train_metric)
            check_cv(['l2', 'l1'], iter_cv_min, False, eval_train_metric)
            check_cv(['l1', 'l2'], iter_cv_min, False, eval_train_metric)

        # test feval for lgb.cv
        check_cv('None', 1, False, False, feval=decreasing_then_constant)
        check_cv('None', 25, True, False, feval=decreasing_then_constant)
        check_cv('None', 1, True, False, feval=constant_then_decreasing)
1661
1662
1663
1664
1665
1666
1667

    def test_node_level_subcol(self):
        # Trains a binary classifier with `feature_fraction_bynode` and checks that
        # additionally lowering `feature_fraction` changes the resulting log loss.
        features, labels = load_breast_cancer(True)
        X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                            test_size=0.1, random_state=42)
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'feature_fraction_bynode': 0.8,
            'feature_fraction': 1.0,
            'verbose': -1
        }
        train_set = lgb.Dataset(X_train, y_train)
        valid_set = lgb.Dataset(X_test, y_test, reference=train_set)
        recorded = {}
        bynode_model = lgb.train(params, train_set,
                                 num_boost_round=25,
                                 valid_sets=valid_set,
                                 verbose_eval=False,
                                 evals_result=recorded)
        bynode_loss = log_loss(y_test, bynode_model.predict(X_test))
        self.assertLess(bynode_loss, 0.13)
        # the last recorded validation value must agree with the loss computed here
        self.assertAlmostEqual(recorded['valid_0']['binary_logloss'][-1], bynode_loss, places=5)

        params['feature_fraction'] = 0.5
        combined_model = lgb.train(params, train_set, num_boost_round=25)
        combined_loss = log_loss(y_test, combined_model.predict(X_test))
        self.assertNotEqual(bynode_loss, combined_loss)
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699

    def test_forced_bins(self):
        # Trains with and without a forced-bins file and checks how many distinct
        # predictions the model produces for a few probe points.
        ramp = np.arange(0, 1, 0.01)
        x = np.column_stack((ramp, -ramp))
        y = np.arange(0, 1, 0.01)
        tests_dir = os.path.dirname(os.path.realpath(__file__))
        params = {'objective': 'regression_l1',
                  'max_bin': 5,
                  'forcedbins_filename': os.path.join(tests_dir,
                                                      '../../examples/regression/forced_bins.json'),
                  'num_leaves': 2,
                  'min_data_in_leaf': 1,
                  'verbose': -1}
        model = lgb.train(params, lgb.Dataset(x, label=y), num_boost_round=20)

        # probe points that differ only in the first feature
        probe = np.array([[0.31, 0.0],
                          [0.37, 0.0],
                          [0.41, 0.0]])
        self.assertEqual(len(np.unique(model.predict(probe))), 3)
        # probe points that differ only in the second feature
        probe = np.array([[0.0, -0.9],
                          [0.0, -0.6],
                          [0.0, -0.3]])
        self.assertEqual(len(np.unique(model.predict(probe))), 1)

        # the same probe points once no forced-bins file is given
        params['forcedbins_filename'] = ''
        model = lgb.train(params, lgb.Dataset(x, label=y), num_boost_round=20)
        self.assertEqual(len(np.unique(model.predict(probe))), 3)

        # second forced-bins file, using the first feature only
        params['forcedbins_filename'] = os.path.join(tests_dir,
                                                     '../../examples/regression/forced_bins2.json')
        params['max_bin'] = 11
        model = lgb.train(params, lgb.Dataset(x[:, :1], label=y), num_boost_round=50)
        _, group_sizes = np.unique(model.predict(x[1:, :1]), return_counts=True)
        # every distinct prediction must cover between 9 and 11 rows
        self.assertGreaterEqual(min(group_sizes), 9)
        self.assertLessEqual(max(group_sizes), 11)

    def test_binning_same_sign(self):
        # test that binning works properly for features with only positive or only negative values
        magnitudes = np.arange(0.01, 1, 0.01)
        # first column is strictly positive, second one strictly negative
        x = np.column_stack((magnitudes, -magnitudes))
        y = np.arange(0.01, 1, 0.01)
        params = dict(objective='regression_l1',
                      max_bin=5,
                      num_leaves=2,
                      min_data_in_leaf=1,
                      verbose=-1,
                      seed=0)
        lgb_x = lgb.Dataset(x, label=y)
        est = lgb.train(params, lgb_x, num_boost_round=20)

        # vary only the all-positive feature: -1 and 0 must be predicted alike, 1 differently
        new_x = np.array([[-1, 0], [0, 0], [1, 0]], dtype=np.float64)
        predicted = est.predict(new_x)
        self.assertAlmostEqual(predicted[0], predicted[1])
        self.assertNotAlmostEqual(predicted[1], predicted[2])

        # vary only the all-negative feature: 0 and 1 must be predicted alike, -1 differently
        new_x = np.array([[0, -1], [0, 0], [0, 1]], dtype=np.float64)
        predicted = est.predict(new_x)
        self.assertNotAlmostEqual(predicted[0], predicted[1])
        self.assertAlmostEqual(predicted[1], predicted[2])