# coding: utf-8
import pickle
from functools import lru_cache
from inspect import getfullargspec

import cloudpickle
import joblib
import numpy as np
import sklearn.datasets
from sklearn.utils import check_random_state

import lightgbm as lgb

SERIALIZERS = ["pickle", "joblib", "cloudpickle"]


@lru_cache(maxsize=None)
def load_breast_cancer(**kwargs):
    return sklearn.datasets.load_breast_cancer(**kwargs)


@lru_cache(maxsize=None)
def load_digits(**kwargs):
    return sklearn.datasets.load_digits(**kwargs)


@lru_cache(maxsize=None)
def load_iris(**kwargs):
    return sklearn.datasets.load_iris(**kwargs)


@lru_cache(maxsize=None)
def load_linnerud(**kwargs):
    return sklearn.datasets.load_linnerud(**kwargs)


def make_ranking(
    n_samples=100, n_features=20, n_informative=5, gmax=2, group=None, random_gs=False, avg_gs=10, random_state=0
):
    """Generate a learning-to-rank dataset - feature vectors grouped together with
    integer-valued graded relevance scores. Replace this with a sklearn.datasets function
    if ranking objective becomes supported in sklearn.datasets module.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        Total number of documents (records) in the dataset.
    n_features : int, optional (default=20)
        Total number of features in the dataset.
    n_informative : int, optional (default=5)
        Number of features that are "informative" for ranking, as they are bias + beta * y
        where bias and beta are standard normal variates. If this is greater than n_features, the dataset will have
        n_features features, all of which will be informative.
    gmax : int, optional (default=2)
        Maximum graded relevance value for creating relevance/target vector. If you set this to 2, for example, all
        documents in a group will have relevance scores of either 0, 1, or 2.
    group : array-like, optional (default=None)
        1-d array or list of group sizes. When `group` is specified, this overrides n_samples, random_gs, and
        avg_gs by simply creating groups with sizes group[0], ..., group[-1].
    random_gs : bool, optional (default=False)
        True will make group sizes ~ Poisson(avg_gs), False will make group sizes == avg_gs.
    avg_gs : int, optional (default=10)
        Average number of documents (records) in each group.
    random_state : int, optional (default=0)
        Random seed.

    Returns
    -------
    X : 2-d np.ndarray of shape = [n_samples (or np.sum(group)), n_features]
        Input feature matrix for ranking objective.
    y : 1-d np.ndarray of shape = [n_samples (or np.sum(group))]
        Integer-graded relevance scores.
    group_ids : 1-d np.ndarray of shape = [n_samples (or np.sum(group))]
        Array of group ids, each value indicating to which group each record belongs.
    """
    rnd_generator = check_random_state(random_state)

    y_vec, group_id_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)
    gid = 0

    # build target, group ID vectors.
    relvalues = range(gmax + 1)

    # build y/target and group-id vectors with user-specified group sizes.
    if group is not None and hasattr(group, "__len__"):
        n_samples = np.sum(group)

        for i, gsize in enumerate(group):
            y_vec = np.concatenate((y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True)))
            group_id_vec = np.concatenate((group_id_vec, [i] * gsize))

    # build y/target and group-id vectors according to n_samples, avg_gs, and random_gs.
    else:
        while len(y_vec) < n_samples:
            gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)

            # skip empty groups (the Poisson draw can be 0); each group needs at least one document.
            if gsize < 1:
                continue

            y_vec = np.append(y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True))
            group_id_vec = np.append(group_id_vec, [gid] * gsize)
            gid += 1

        y_vec, group_id_vec = y_vec[:n_samples], group_id_vec[:n_samples]

    # build feature data, X. Transform first few into informative features.
    n_informative = max(min(n_features, n_informative), 0)
    X = rnd_generator.uniform(size=(n_samples, n_features))

    for j in range(n_informative):
        bias, coef = rnd_generator.normal(size=2)
        X[:, j] = bias + coef * y_vec

    return X, y_vec, group_id_vec
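

# Hedged usage sketch (not part of the original utilities): one way the output of
# make_ranking() could be fed to LGBMRanker. The model settings below are illustrative
# assumptions, not values used by the real tests.
def _example_fit_ranker_on_synthetic_data():
    X, y, group_ids = make_ranking(n_samples=200, n_features=10, random_state=42)
    # LGBMRanker expects group sizes in row order, not per-row group ids; make_ranking()
    # assigns consecutive group ids starting at 0, so the sorted counts from np.unique()
    # line up with the row order.
    _, group_sizes = np.unique(group_ids, return_counts=True)
    ranker = lgb.LGBMRanker(n_estimators=5)
    ranker.fit(X, y, group=group_sizes)
    return ranker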


@lru_cache(maxsize=None)
def make_synthetic_regression(n_samples=100, n_features=4, n_informative=2, random_state=42):
    return sklearn.datasets.make_regression(
        n_samples=n_samples, n_features=n_features, n_informative=n_informative, random_state=random_state
    )


def dummy_obj(preds, train_data):
    # trivial custom objective: constant gradient and hessian of 1 for every prediction
    return np.ones(preds.shape), np.ones(preds.shape)


def mse_obj(y_pred, dtrain):
    # gradient and hessian of squared-error loss 0.5 * (y_pred - y_true) ** 2
    y_true = dtrain.get_label()
    grad = y_pred - y_true
    hess = np.ones(len(grad))
    return grad, hess
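

# Hedged usage sketch (not part of the original utilities): passing mse_obj as a custom
# objective to lgb.train(). Assumes a LightGBM version that accepts a callable "objective"
# in params; the parameter values are illustrative only.
def _example_train_with_mse_obj():
    X, y = make_synthetic_regression()
    params = {"objective": mse_obj, "verbose": -1}
    return lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=2)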


def softmax(x):
    row_wise_max = np.max(x, axis=1).reshape(-1, 1)
    exp_x = np.exp(x - row_wise_max)
    return exp_x / np.sum(exp_x, axis=1).reshape(-1, 1)


def logistic_sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def sklearn_multiclass_custom_objective(y_true, y_pred, weight=None):
    num_rows, num_class = y_pred.shape
    prob = softmax(y_pred)
    grad_update = np.zeros_like(prob)
    grad_update[np.arange(num_rows), y_true.astype(np.int32)] = -1.0
    grad = prob + grad_update
    factor = num_class / (num_class - 1)
    hess = factor * prob * (1 - prob)
    if weight is not None:
        weight2d = weight.reshape(-1, 1)
        grad *= weight2d
        hess *= weight2d
    return grad, hess
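

# Hedged usage sketch (not part of the original utilities): using the function above as a
# custom objective through the scikit-learn estimator interface. The settings below are
# illustrative assumptions, not values used by the real tests.
def _example_fit_classifier_with_custom_objective():
    X, y = load_iris(return_X_y=True)
    clf = lgb.LGBMClassifier(objective=sklearn_multiclass_custom_objective, n_estimators=5)
    clf.fit(X, y)
    return clf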


def pickle_obj(obj, filepath, serializer):
    if serializer == "pickle":
        with open(filepath, "wb") as f:
            pickle.dump(obj, f)
    elif serializer == "joblib":
        joblib.dump(obj, filepath)
    elif serializer == "cloudpickle":
        with open(filepath, "wb") as f:
            cloudpickle.dump(obj, f)
    else:
        raise ValueError(f"Unrecognized serializer type: {serializer}")


def unpickle_obj(filepath, serializer):
    if serializer == "pickle":
        with open(filepath, "rb") as f:
            return pickle.load(f)
    elif serializer == "joblib":
        return joblib.load(filepath)
    elif serializer == "cloudpickle":
        with open(filepath, "rb") as f:
            return cloudpickle.load(f)
    else:
        raise ValueError(f"Unrecognized serializer type: {serializer}")


def pickle_and_unpickle_object(obj, serializer):
    with lgb.basic._TempFile() as tmp_file:
        pickle_obj(obj=obj, filepath=tmp_file.name, serializer=serializer)
        obj_from_disk = unpickle_obj(filepath=tmp_file.name, serializer=serializer)
    return obj_from_disk  # noqa: RET504
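

# Hedged usage sketch (not part of the original utilities): round-tripping a fitted model
# through every supported serializer and checking that predictions survive the trip. The
# model settings below are illustrative assumptions.
def _example_pickle_roundtrip():
    X, y = make_synthetic_regression()
    model = lgb.LGBMRegressor(n_estimators=2).fit(X, y)
    for serializer in SERIALIZERS:
        model_from_disk = pickle_and_unpickle_object(obj=model, serializer=serializer)
        np.testing.assert_allclose(model.predict(X), model_from_disk.predict(X))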


def assert_silent(capsys) -> None:
    """
    Given a ``CaptureFixture`` instance (from the ``pytest`` built-in ``capsys`` fixture),
    read the recently-captured data into a variable and assert that nothing was written
    to stdout or stderr.

    This is just here to turn 3 lines of repetitive code into 1.

    Note that this does have a side effect: ``capsys.readouterr()`` copies from a buffer
    and then frees it, so ``.out`` and ``.err`` will only contain the output captured since
    the last time that ``.readouterr()`` was called.

    ref: https://docs.pytest.org/en/stable/how-to/capture-stdout-stderr.html
    """
    captured = capsys.readouterr()
    assert captured.out == "", captured.out
    assert captured.err == "", captured.err
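

# Hedged usage sketch (not part of the original utilities): how assert_silent() might be
# used inside a pytest test body. Assumes that training with "verbose": -1 emits nothing
# to stdout/stderr; the settings below are illustrative only.
def _example_training_is_silent(capsys):
    X, y = make_synthetic_regression()
    params = {"objective": "regression", "verbose": -1}
    lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=1)
    assert_silent(capsys)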


# doing this here, at import time, to ensure it only runs once per import
# instead of once per assertion
_numpy_testing_supports_strict_kwarg = "strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs


def np_assert_array_equal(*args, **kwargs):
    """
    np.testing.assert_array_equal() only got the kwarg ``strict`` in June 2022:
    https://github.com/numpy/numpy/pull/21595

    This function is here to support testing on older Python (and therefore older ``numpy``) versions.
    """
    if not _numpy_testing_supports_strict_kwarg:
        kwargs.pop("strict")
    np.testing.assert_array_equal(*args, **kwargs)
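

# Hedged usage sketch (not part of the original utilities): with strict=True the dtypes are
# also compared on numpy versions that support it; on older numpy the wrapper above simply
# drops the kwarg.
def _example_np_assert_array_equal():
    a = np.array([1, 2, 3], dtype=np.int32)
    b = np.array([1, 2, 3], dtype=np.int32)
    np_assert_array_equal(a, b, strict=True)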


def assert_subtree_valid(root):
    """Recursively checks the validity of a subtree rooted at `root`.

    Currently it only checks whether weights and counts are consistent between
    all parent nodes and their children.

    Parameters
    ----------
    root : dict
        A dictionary representing the root of the subtree.
        It should be produced by dump_model().

    Returns
    -------
    tuple
        A tuple containing the weight and count of the subtree rooted at `root`.
    """
    if "leaf_count" in root:
        return (root["leaf_weight"], root["leaf_count"])

    left_child = root["left_child"]
    right_child = root["right_child"]
    (l_w, l_c) = assert_subtree_valid(left_child)
    (r_w, r_c) = assert_subtree_valid(right_child)
    assert (
        abs(root["internal_weight"] - (l_w + r_w)) <= 1e-3
    ), "root node's internal weight should be approximately the sum of its child nodes' internal weights"
    assert (
        root["internal_count"] == l_c + r_c
    ), "root node's internal count should be exactly the sum of its child nodes' internal counts"
    return (root["internal_weight"], root["internal_count"])


def assert_all_trees_valid(model_dump):
    for idx, tree in enumerate(model_dump["tree_info"]):
        assert tree["tree_index"] == idx, f"tree {idx} should have tree_index={idx}. Full tree: {tree}"
        assert_subtree_valid(tree["tree_structure"])
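

# Hedged usage sketch (not part of the original utilities): validating every tree of a
# freshly trained model from its dump_model() output. The training settings below are
# illustrative assumptions.
def _example_validate_dumped_trees():
    X, y = make_synthetic_regression()
    params = {"objective": "regression", "verbose": -1}
    booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=3)
    assert_all_trees_valid(booster.dump_model())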