test_plotting.py 19.8 KB
Newer Older
1
# coding: utf-8
2
import numpy as np
3
import pandas as pd
4
import pytest
5
6
7
from sklearn.model_selection import train_test_split

import lightgbm as lgb
8
from lightgbm.compat import GRAPHVIZ_INSTALLED, MATPLOTLIB_INSTALLED, PANDAS_INSTALLED, pd_DataFrame
9

10
if MATPLOTLIB_INSTALLED:
wxchan's avatar
wxchan committed
11
    import matplotlib
12
13

    matplotlib.use("Agg")
14
15
if GRAPHVIZ_INSTALLED:
    import graphviz
16

17
from .utils import load_breast_cancer, make_synthetic_regression
18

19

20
21
@pytest.fixture(scope="module")
def breast_cancer_split():
    """Return a train/test split (10% held out, fixed seed), created once per module."""
    features, target = load_breast_cancer(return_X_y=True)
    return train_test_split(features, target, test_size=0.1, random_state=1)
23
24


25
26
27
28
29
30
31
32
33
34
35
def _categorical_data(category_values_lower_bound, category_values_upper_bound):
    """Build an all-categorical version of the breast cancer features.

    Each feature column is quantile-binned with ``pd.qcut`` into an unordered
    categorical. The number of categories per column is drawn from a seeded
    ``RandomState`` via ``randint(category_values_lower_bound, category_values_upper_bound)``.

    Returns a ``(DataFrame, target)`` pair; the target is passed through unchanged.
    """
    features, target = load_breast_cancer(return_X_y=True)
    n_features = features.shape[1]
    # Fixed seed so the per-column category counts are the same on every run.
    category_counts = np.random.RandomState(0).randint(
        category_values_lower_bound, category_values_upper_bound, size=n_features
    )
    frame = pd.DataFrame()
    for col_idx, n_categories in enumerate(category_counts):
        # n_categories bins need n_categories + 1 evenly spaced quantile edges.
        quantile_edges = np.linspace(0, 1, num=n_categories + 1)
        binned = pd.qcut(features[:, col_idx], q=quantile_edges, labels=range(n_categories))
        frame[f"cat_col_{col_idx}"] = binned.as_unordered()
    return frame, target


36
37
38
39
40
@pytest.fixture(scope="module")
def train_data(breast_cancer_split):
    """Wrap the training part of the shared split in a ``lgb.Dataset``, once per module."""
    train_features, _, train_target, _ = breast_cancer_split
    return lgb.Dataset(train_features, train_target)

41

42
43
@pytest.fixture
def params():
    """Return a fresh training-parameter dict for each test (function-scoped, so safe to mutate)."""
    return dict(objective="binary", verbose=-1, num_leaves=3)
45
46


47
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_importance(params, breast_cancer_split, train_data):
    """Exercise ``lgb.plot_importance`` on a Booster and on sklearn-style classifiers.

    Checks default labels, custom labels and colours, suppressed labels,
    ``@importance_type@`` substitution in the x label, and how an explicit
    ``importance_type`` argument relates to the one configured on the estimator.
    """
    X_train, _, y_train, _ = breast_cancer_split

    def check_axes(axes, title, xlabel, ylabel):
        # Assertions shared by every plot below: type, the three labels, bar-count bound.
        assert isinstance(axes, matplotlib.axes.Axes)
        assert axes.get_title() == title
        assert axes.get_xlabel() == xlabel
        assert axes.get_ylabel() == ylabel
        assert len(axes.patches) <= 30

    # Booster with default arguments.
    booster = lgb.train(params, train_data, num_boost_round=10)
    check_axes(lgb.plot_importance(booster), "Feature importance", "Feature importance", "Features")

    # sklearn estimator with custom labels and a single colour.
    clf_default = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    clf_default.fit(X_train, y_train)
    custom_axes = lgb.plot_importance(clf_default, color="r", title="t", xlabel="x", ylabel="y")
    check_axes(custom_axes, "t", "x", "y")
    for bar in custom_axes.patches:
        assert bar.get_facecolor() == (1.0, 0, 0, 1.0)  # red

    # Labels disabled, one colour per bar.
    bare_axes = lgb.plot_importance(booster, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None)
    check_axes(bare_axes, "", "", "")
    expected_rgba = [
        (1.0, 0, 0, 1.0),  # r
        (0.75, 0.75, 0, 1.0),  # y
        (0, 0.5, 0, 1.0),  # g
        (0, 0, 1.0, 1.0),  # b
    ]
    for bar_idx, rgba in enumerate(expected_rgba):
        assert bare_axes.patches[bar_idx].get_facecolor() == rgba

    # Only the x label is expected to have the placeholder replaced.
    placeholder_axes = lgb.plot_importance(
        booster, title="t @importance_type@", xlabel="x @importance_type@", ylabel="y @importance_type@"
    )
    check_axes(placeholder_axes, "t @importance_type@", "x split", "y @importance_type@")

    clf_gain = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, importance_type="gain")
    clf_gain.fit(X_train, y_train)

    def first_bar_bounds(model, **plot_kwargs):
        return lgb.plot_importance(model, **plot_kwargs).patches[0].get_extents().bounds

    default_on_default = first_bar_bounds(clf_default)
    split_on_default = first_bar_bounds(clf_default, importance_type="split")
    gain_on_default = first_bar_bounds(clf_default, importance_type="gain")
    default_on_gain = first_bar_bounds(clf_gain)
    split_on_gain = first_bar_bounds(clf_gain, importance_type="split")
    gain_on_gain = first_bar_bounds(clf_gain, importance_type="gain")

    # An explicit importance_type wins; otherwise the estimator's own setting is used.
    assert default_on_default == split_on_default
    assert default_on_default == split_on_gain
    assert gain_on_default == default_on_gain
    assert gain_on_default == gain_on_gain
    assert default_on_default != gain_on_default

110

111
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_split_value_histogram(params, breast_cancer_split, train_data):
    """Exercise ``lgb.plot_split_value_histogram`` by feature index and by feature name.

    Checks default and custom labels, title placeholder substitution, bar colours,
    an explicit ``bins`` count, and the ``ValueError`` raised for the feature at index 0.
    """
    X_train, _, y_train, _ = breast_cancer_split
    feature_idx = 27

    def check_labels(axes, title, xlabel, ylabel):
        # Assertions shared by every histogram below.
        assert isinstance(axes, matplotlib.axes.Axes)
        assert axes.get_title() == title
        assert axes.get_xlabel() == xlabel
        assert axes.get_ylabel() == ylabel

    # Booster, feature selected by index, default labels.
    booster = lgb.train(params, train_data, num_boost_round=10)
    default_axes = lgb.plot_split_value_histogram(booster, feature_idx)
    check_labels(
        default_axes, "Split value histogram for feature with index 27", "Feature split value", "Count"
    )
    assert len(default_axes.patches) <= 2

    # sklearn estimator, feature selected by name, custom labels and colour.
    clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    clf.fit(X_train, y_train)
    named_axes = lgb.plot_split_value_histogram(
        clf,
        clf.booster_.feature_name()[feature_idx],
        figsize=(10, 5),
        title="Histogram for feature @index/name@ @feature@",
        xlabel="x",
        ylabel="y",
        color="r",
    )
    expected_title = f"Histogram for feature name {clf.booster_.feature_name()[feature_idx]}"
    check_labels(named_axes, expected_title, "x", "y")
    assert len(named_axes.patches) <= 2
    for bar in named_axes.patches:
        assert bar.get_facecolor() == (1.0, 0, 0, 1.0)  # red

    # Labels disabled, explicit bin count, one colour per bar.
    binned_axes = lgb.plot_split_value_histogram(
        booster, feature_idx, bins=10, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None
    )
    check_labels(binned_axes, "", "", "")
    assert len(binned_axes.patches) == 10
    expected_rgba = [
        (1.0, 0, 0, 1.0),  # r
        (0.75, 0.75, 0, 1.0),  # y
        (0, 0.5, 0, 1.0),  # g
        (0, 0, 1.0, 1.0),  # b
    ]
    for bar_idx, rgba in enumerate(expected_rgba):
        assert binned_axes.patches[bar_idx].get_facecolor() == rgba

    with pytest.raises(ValueError):
        lgb.plot_split_value_histogram(booster, 0)  # was not used in splitting


161
162
163
@pytest.mark.skipif(
    not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED, reason="matplotlib or graphviz is not installed"
)
def test_plot_tree(breast_cancer_split):
    """Check ``lgb.plot_tree``: bad tree index raises, and ``figsize`` is honoured."""
    X_train, _, y_train, _ = breast_cancer_split
    clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    clf.fit(X_train, y_train)

    # Only 10 trees were fitted, so index 83 does not exist.
    with pytest.raises(IndexError):
        lgb.plot_tree(clf, tree_index=83)

    tree_axes = lgb.plot_tree(clf, tree_index=3, figsize=(15, 8), show_info=["split_gain"])
    assert isinstance(tree_axes, matplotlib.axes.Axes)
    width, height = tree_axes.axes.get_figure().get_size_inches()
    assert int(width) == 15
    assert int(height) == 8


179
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_create_tree_digraph(breast_cancer_split):
    """Check ``lgb.create_tree_digraph`` on a classifier trained with monotone constraints.

    Verifies that an out-of-range ``tree_index`` raises ``IndexError``, that the
    ``name``/``node_attr`` keyword arguments reach the ``graphviz.Digraph``, and that
    the graph body contains exactly the requested ``show_info`` fields.
    """
    import tempfile  # local import: only the render step of this test needs it

    X_train, _, y_train, _ = breast_cancer_split

    # Alternate decreasing/increasing constraints across the features.
    constraints = [-1, 1] * int(X_train.shape[1] / 2)
    gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, monotone_constraints=constraints)
    gbm.fit(X_train, y_train)

    # Only 10 trees were fitted, so index 83 does not exist.
    with pytest.raises(IndexError):
        lgb.create_tree_digraph(gbm, tree_index=83)

    graph = lgb.create_tree_digraph(
        gbm,
        tree_index=3,
        show_info=["split_gain", "internal_value", "internal_weight"],
        name="Tree4",
        node_attr={"color": "red"},
    )
    # Render into a throwaway directory so the test leaves no output files
    # behind in the current working directory.
    with tempfile.TemporaryDirectory() as render_dir:
        graph.render(directory=render_dir, view=False)
    assert isinstance(graph, graphviz.Digraph)
    assert graph.name == "Tree4"
    assert len(graph.node_attr) == 1
    assert graph.node_attr["color"] == "red"
    assert len(graph.graph_attr) == 0
    assert len(graph.edge_attr) == 0
    graph_body = "".join(graph.body)
    assert "leaf" in graph_body
    assert "gain" in graph_body
    assert "value" in graph_body
    assert "weight" in graph_body
    # NOTE(review): these two fill colours presumably mark the constrained
    # (decreasing / increasing) features - confirm against lgb.plotting.
    assert "#ffdddd" in graph_body
    assert "#ddffdd" in graph_body
    # Fields that were not requested via show_info must be absent.
    assert "data" not in graph_body
    assert "count" not in graph_body


@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_below_max_category_values():
    """Check that category lists are not abbreviated when under ``max_category_values``.

    Trains on all-categorical data with 2 to 9 categories per column and asserts
    that the digraph drawn with ``max_category_values=10`` contains no ``||...||`` marker.
    """
    import tempfile  # local import: only the render step of this test needs it

    X_train, y_train = _categorical_data(2, 10)
    params = {
        "n_estimators": 10,
        "num_leaves": 3,
        "min_data_in_bin": 1,
        "force_col_wise": True,
        "deterministic": True,
        "num_threads": 1,
        "seed": 708,
        "verbose": -1,
    }
    gbm = lgb.LGBMClassifier(**params)
    gbm.fit(X_train, y_train)

    # Only 10 trees were fitted, so index 83 does not exist.
    with pytest.raises(IndexError):
        lgb.create_tree_digraph(gbm, tree_index=83)

    graph = lgb.create_tree_digraph(
        gbm,
        tree_index=3,
        show_info=["split_gain", "internal_value", "internal_weight"],
        name="Tree4",
        node_attr={"color": "red"},
        max_category_values=10,
    )
    # Render into a throwaway directory so the test leaves no output files
    # behind in the current working directory.
    with tempfile.TemporaryDirectory() as render_dir:
        graph.render(directory=render_dir, view=False)
    assert isinstance(graph, graphviz.Digraph)
    assert graph.name == "Tree4"
    assert len(graph.node_attr) == 1
    assert graph.node_attr["color"] == "red"
    assert len(graph.graph_attr) == 0
    assert len(graph.edge_attr) == 0
    graph_body = "".join(graph.body)
    assert "leaf" in graph_body
    assert "gain" in graph_body
    assert "value" in graph_body
    assert "weight" in graph_body
    # Fields that were not requested via show_info must be absent.
    assert "data" not in graph_body
    assert "count" not in graph_body
    # No split has more categories than max_category_values, so nothing is abbreviated.
    assert "||...||" not in graph_body
257
258


259
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_above_max_category_values():
    """Check that long category lists are abbreviated above ``max_category_values``.

    Trains on all-categorical data with 20 to 29 categories per column and asserts
    that the digraph drawn with ``max_category_values=4`` contains the ``||...||`` marker.
    """
    import tempfile  # local import: only the render step of this test needs it

    X_train, y_train = _categorical_data(20, 30)
    params = {
        "n_estimators": 10,
        "num_leaves": 3,
        "min_data_in_bin": 1,
        "force_col_wise": True,
        "deterministic": True,
        "num_threads": 1,
        "seed": 708,
        "verbose": -1,
    }
    gbm = lgb.LGBMClassifier(**params)
    gbm.fit(X_train, y_train)

    # Only 10 trees were fitted, so index 83 does not exist.
    with pytest.raises(IndexError):
        lgb.create_tree_digraph(gbm, tree_index=83)

    graph = lgb.create_tree_digraph(
        gbm,
        tree_index=9,
        show_info=["split_gain", "internal_value", "internal_weight"],
        name="Tree4",
        node_attr={"color": "red"},
        max_category_values=4,
    )
    # Render into a throwaway directory so the test leaves no output files
    # behind in the current working directory.
    with tempfile.TemporaryDirectory() as render_dir:
        graph.render(directory=render_dir, view=False)
    assert isinstance(graph, graphviz.Digraph)
    assert graph.name == "Tree4"
    assert len(graph.node_attr) == 1
    assert graph.node_attr["color"] == "red"
    assert len(graph.graph_attr) == 0
    assert len(graph.edge_attr) == 0
    graph_body = "".join(graph.body)
    assert "leaf" in graph_body
    assert "gain" in graph_body
    assert "value" in graph_body
    assert "weight" in graph_body
    # Fields that were not requested via show_info must be absent.
    assert "data" not in graph_body
    assert "count" not in graph_body
    # At least one split lists more than max_category_values categories and is abbreviated.
    assert "||...||" in graph_body


@pytest.mark.parametrize("use_missing", [True, False])
@pytest.mark.parametrize("zero_as_missing", [True, False])
def test_numeric_split_direction(use_missing, zero_as_missing):
    """Check ``_determine_direction_for_numeric_split`` against the Booster's own routing.

    Walks the dumped tree for an all-zero row (and, when ``use_missing`` is set, an
    all-NaN row) using the plotting helper, and asserts the leaf reached equals the
    one reported by ``predict(..., pred_leaf=True)``.
    """
    if use_missing and zero_as_missing:
        pytest.skip("use_missing and zero_as_missing both set to True")

    X, y = make_synthetic_regression()
    rng = np.random.RandomState(0)
    n_rows = X.shape[0]
    # Blank out some rows entirely with zeros ...
    zero_mask = rng.rand(n_rows) < 0.05
    X[zero_mask, :] = 0
    if use_missing:
        # ... and some of the other rows entirely with NaN.
        nan_mask = ~zero_mask & (rng.rand(n_rows) < 0.1)
        X[nan_mask, :] = np.nan

    train_set = lgb.Dataset(X, y)
    train_params = {
        "num_leaves": 127,
        "min_child_samples": 1,
        "use_missing": use_missing,
        "zero_as_missing": zero_as_missing,
    }
    bst = lgb.train(train_params, train_set, num_boost_round=1)

    def leaf_reached_by(row):
        # Follow the first tree from the root using the plotting helper's decisions.
        current = bst.dump_model()["tree_info"][0]["tree_structure"]
        while "decision_type" in current:
            side = lgb.plotting._determine_direction_for_numeric_split(
                row[current["split_feature"]],
                current["threshold"],
                current["missing_type"],
                current["default_left"],
            )
            current = current["left_child"] if side == "left" else current["right_child"]
        return current["leaf_index"]

    case_with_zero = X[zero_mask][[0]]
    expected_leaf_zero = bst.predict(case_with_zero, pred_leaf=True)[0]
    assert leaf_reached_by(case_with_zero[0]) == expected_leaf_zero

    if use_missing:
        case_with_nan = X[nan_mask][[0]]
        expected_leaf_nan = bst.predict(case_with_nan, pred_leaf=True)[0]
        assert leaf_reached_by(case_with_nan[0]) == expected_leaf_nan
        # Zeros and NaNs must not be routed to the same leaf.
        assert expected_leaf_zero != expected_leaf_nan


347
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_example_case_in_tree_digraph():
    """Check that ``create_tree_digraph`` highlights the path taken by ``example_case``.

    For every tree: each split node, edge and leaf on the example's decision path must
    be drawn with ``color=blue``, every other node/edge with ``color=black``, and the
    leaf reached must equal the one reported by ``predict(..., pred_leaf=True)``.
    The test also requires that at least one categorical split was traversed.
    """
    rng = np.random.RandomState(0)
    x1 = rng.rand(100)
    cat = rng.randint(1, 3, size=x1.size)
    X = np.vstack([x1, cat]).T
    y = x1 + 2 * cat
    feature_name = ["x1", "cat"]
    ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=["cat"])

    num_round = 3
    bst = lgb.train({"num_leaves": 7}, ds, num_boost_round=num_round)
    mod = bst.dump_model()
    example_case = X[[0]]
    makes_categorical_splits = False
    for i in range(num_round):
        graph = lgb.create_tree_digraph(bst, example_case=example_case, tree_index=i)
        gbody = graph.body
        # Body positions of the elements on the example's path in THIS tree. Must be
        # reset per tree: positions recorded for another tree's graph would otherwise
        # exempt unrelated elements from the "rest is black" check below.
        seen_indices = set()
        node = mod["tree_info"][i]["tree_structure"]
        while "decision_type" in node:  # iterate through the splits
            split_index = node["split_index"]

            node_in_graph = [n for n in gbody if f"split{split_index}" in n and "->" not in n]
            assert len(node_in_graph) == 1
            seen_indices.add(gbody.index(node_in_graph[0]))

            edge_to_node = [e for e in gbody if f"-> split{split_index}" in e]
            if node["decision_type"] == "<=":
                direction = lgb.plotting._determine_direction_for_numeric_split(
                    example_case[0][node["split_feature"]],
                    node["threshold"],
                    node["missing_type"],
                    node["default_left"],
                )
            else:
                makes_categorical_splits = True
                direction = lgb.plotting._determine_direction_for_categorical_split(
                    example_case[0][node["split_feature"]], node["threshold"]
                )
            node = node["left_child"] if direction == "left" else node["right_child"]
            assert "color=blue" in node_in_graph[0]
            # The root split has no incoming edge, hence the guard.
            if edge_to_node:
                assert len(edge_to_node) == 1
                assert "color=blue" in edge_to_node[0]
                seen_indices.add(gbody.index(edge_to_node[0]))
        # we're in a leaf now
        leaf_index = node["leaf_index"]
        leaf_in_graph = [n for n in gbody if f"leaf{leaf_index}" in n and "->" not in n]
        edge_to_leaf = [e for e in gbody if f"-> leaf{leaf_index}" in e]
        assert len(leaf_in_graph) == 1
        assert "color=blue" in leaf_in_graph[0]
        assert len(edge_to_leaf) == 1
        assert "color=blue" in edge_to_leaf[0]
        seen_indices.update([gbody.index(leaf_in_graph[0]), gbody.index(edge_to_leaf[0])])

        # check that the rest of the elements have black color
        # (entries containing "graph" are skipped, as they are not nodes or edges)
        remaining_elements = [
            e for pos, e in enumerate(gbody) if pos not in seen_indices and "graph" not in e
        ]
        assert all("color=black" in e for e in remaining_elements)

        # check that we got to the expected leaf
        expected_leaf = bst.predict(example_case, start_iteration=i, num_iteration=1, pred_leaf=True)[0]
        assert leaf_index == expected_leaf
    assert makes_categorical_splits


413
414
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
@pytest.mark.parametrize("input_type", ["array", "dataframe"])
def test_empty_example_case_on_tree_digraph_raises_error(input_type):
    """Check that a zero-row ``example_case`` is rejected by ``create_tree_digraph``.

    Runs once with array input and once with a DataFrame; the DataFrame variant is
    skipped when pandas is not installed.
    """
    X, y = make_synthetic_regression()
    if input_type == "dataframe":
        if not PANDAS_INSTALLED:
            pytest.skip(reason="pandas is not installed")
        X = pd_DataFrame(X)
    ds = lgb.Dataset(X, y)
    bst = lgb.train({"num_leaves": 3}, ds, num_boost_round=1)
    example_case = X[:0]  # empty slice: zero rows
    if input_type == "dataframe":
        example_case = pd_DataFrame(example_case)
    # `match` is a regular expression, so the trailing period is escaped to
    # require a literal "." rather than any character.
    with pytest.raises(ValueError, match=r"example_case must have a single row\."):
        lgb.create_tree_digraph(bst, tree_index=0, example_case=example_case)


430
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_metrics(params, breast_cancer_split, train_data):
    """Exercise ``lgb.plot_metric`` on recorded eval results and on a fitted estimator.

    Covers the multi-metric warning, explicit metric and dataset selection,
    ``@metric@`` substitution in the y label, figure options (``figsize``, ``dpi``,
    ``grid``), the error for empty eval results, and suppressed labels.
    """
    X_train, X_test, y_train, y_test = breast_cancer_split
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)
    params.update({"metric": {"binary_logloss", "binary_error"}})

    evals_result0 = {}
    lgb.train(
        params,
        train_data,
        valid_sets=[train_data, test_data],
        valid_names=["v1", "v2"],
        num_boost_round=10,
        callbacks=[lgb.record_evaluation(evals_result0)],
    )

    # Two metrics were recorded and none is requested, so a warning is expected.
    # `match` is a regular expression: the trailing period is escaped to be literal.
    with pytest.warns(UserWarning, match=r"More than one metric available, picking one to plot\."):
        ax0 = lgb.plot_metric(evals_result0)
    assert isinstance(ax0, matplotlib.axes.Axes)
    assert ax0.get_title() == "Metric during training"
    assert ax0.get_xlabel() == "Iterations"
    # Either metric may be picked, so accept both.
    assert ax0.get_ylabel() in {"binary_logloss", "binary_error"}
    legend_items = ax0.get_legend().get_texts()
    assert len(legend_items) == 2
    assert legend_items[0].get_text() == "v1"
    assert legend_items[1].get_text() == "v2"

    # Explicit metric, all datasets.
    ax1 = lgb.plot_metric(evals_result0, metric="binary_error")
    assert isinstance(ax1, matplotlib.axes.Axes)
    assert ax1.get_title() == "Metric during training"
    assert ax1.get_xlabel() == "Iterations"
    assert ax1.get_ylabel() == "binary_error"
    legend_items = ax1.get_legend().get_texts()
    assert len(legend_items) == 2
    assert legend_items[0].get_text() == "v1"
    assert legend_items[1].get_text() == "v2"

    # Explicit metric, single dataset.
    ax2 = lgb.plot_metric(evals_result0, metric="binary_logloss", dataset_names=["v2"])
    assert isinstance(ax2, matplotlib.axes.Axes)
    assert ax2.get_title() == "Metric during training"
    assert ax2.get_xlabel() == "Iterations"
    assert ax2.get_ylabel() == "binary_logloss"
    legend_items = ax2.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == "v2"

    # Custom labels and figure options; only the y label is expected to have
    # the "@metric@" placeholder replaced.
    ax3 = lgb.plot_metric(
        evals_result0,
        metric="binary_logloss",
        dataset_names=["v1"],
        title="Metric @metric@",
        xlabel="Iterations @metric@",
        ylabel='Value of "@metric@"',
        figsize=(5, 5),
        dpi=600,
        grid=False,
    )
    assert isinstance(ax3, matplotlib.axes.Axes)
    assert ax3.get_title() == "Metric @metric@"
    assert ax3.get_xlabel() == "Iterations @metric@"
    assert ax3.get_ylabel() == 'Value of "binary_logloss"'
    legend_items = ax3.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == "v1"
    assert ax3.get_figure().get_figheight() == 5
    assert ax3.get_figure().get_figwidth() == 5
    assert ax3.get_figure().get_dpi() == 600
    for grid_line in ax3.get_xgridlines():
        assert not grid_line.get_visible()
    for grid_line in ax3.get_ygridlines():
        assert not grid_line.get_visible()

    # Training without validation sets records nothing, which plot_metric rejects.
    evals_result1 = {}
    lgb.train(params, train_data, num_boost_round=10, callbacks=[lgb.record_evaluation(evals_result1)])
    with pytest.raises(ValueError, match=r"eval results cannot be empty\."):
        lgb.plot_metric(evals_result1)

    # sklearn estimator with all labels disabled.
    gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    ax4 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
    assert isinstance(ax4, matplotlib.axes.Axes)
    assert ax4.get_title() == ""
    assert ax4.get_xlabel() == ""
    assert ax4.get_ylabel() == ""
    legend_items = ax4.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == "valid_0"