"src/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "76102284d1997ef8b876cb40ae7c3e44b6ffbb66"
# coding: utf-8
import pickle
from functools import lru_cache

import cloudpickle
import joblib
import numpy as np
import sklearn.datasets
from sklearn.utils import check_random_state

import lightgbm as lgb

SERIALIZERS = ["pickle", "joblib", "cloudpickle"]


# The dataset loaders below are memoized with lru_cache so that repeated calls
# across tests reuse the already-loaded data instead of reloading it from scikit-learn.
@lru_cache(maxsize=None)
def load_breast_cancer(**kwargs):
    return sklearn.datasets.load_breast_cancer(**kwargs)


@lru_cache(maxsize=None)
def load_digits(**kwargs):
    return sklearn.datasets.load_digits(**kwargs)


@lru_cache(maxsize=None)
def load_iris(**kwargs):
    return sklearn.datasets.load_iris(**kwargs)


@lru_cache(maxsize=None)
def load_linnerud(**kwargs):
    return sklearn.datasets.load_linnerud(**kwargs)


def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
                 group=None, random_gs=False, avg_gs=10, random_state=0):
    """Generate a learning-to-rank dataset - feature vectors grouped together with
    integer-valued graded relevance scores. Replace this with a sklearn.datasets function
    if a ranking objective ever becomes supported in the sklearn.datasets module.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        Total number of documents (records) in the dataset.
    n_features : int, optional (default=20)
        Total number of features in the dataset.
    n_informative : int, optional (default=5)
        Number of features that are "informative" for ranking, as they are bias + coef * y
        where bias and coef are standard normal variates. If this is greater than n_features, the dataset will have
        n_features features, all of which will be informative.
    gmax : int, optional (default=2)
        Maximum graded relevance value for creating relevance/target vector. If you set this to 2, for example, all
        documents in a group will have relevance scores of either 0, 1, or 2.
    group : array-like, optional (default=None)
        1-d array or list of group sizes. When `group` is specified, this overrides n_samples, random_gs, and
        avg_gs by simply creating groups with sizes group[0], ..., group[-1].
    random_gs : bool, optional (default=False)
        True will make group sizes ~ Poisson(avg_gs), False will make group sizes == avg_gs.
    avg_gs : int, optional (default=10)
        Average number of documents (records) in each group.
    random_state : int, optional (default=0)
        Random seed.

    Returns
    -------
    X : 2-d np.ndarray of shape = [n_samples (or np.sum(group)), n_features]
        Input feature matrix for ranking objective.
    y : 1-d np.array of shape = [n_samples (or np.sum(group))]
        Integer-graded relevance scores.
    group_ids : 1-d np.array of shape = [n_samples (or np.sum(group))]
        Array of group ids, each value indicates to which group each record belongs.
    """
    rnd_generator = check_random_state(random_state)

    y_vec, group_id_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)
    gid = 0

    # build target, group ID vectors.
    relvalues = range(gmax + 1)

    # build y/target and group-id vectors with user-specified group sizes.
    if group is not None and hasattr(group, '__len__'):
        n_samples = np.sum(group)

        for i, gsize in enumerate(group):
            y_vec = np.concatenate((y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True)))
            group_id_vec = np.concatenate((group_id_vec, [i] * gsize))

    # build y/target and group-id vectors according to n_samples, avg_gs, and random_gs.
    else:
        while len(y_vec) < n_samples:
            gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)

            # skip empty groups drawn from the Poisson; each group must contain at least one
            # document (and ideally more than one for a pairwise learning objective).
            if gsize < 1:
                continue

            y_vec = np.append(y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True))
            group_id_vec = np.append(group_id_vec, [gid] * gsize)
            gid += 1

        y_vec, group_id_vec = y_vec[:n_samples], group_id_vec[:n_samples]

    # build feature data, X. Transform first few into informative features.
    n_informative = max(min(n_features, n_informative), 0)
    X = rnd_generator.uniform(size=(n_samples, n_features))

    for j in range(n_informative):
        bias, coef = rnd_generator.normal(size=2)
        X[:, j] = bias + coef * y_vec

    return X, y_vec, group_id_vec
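

# Illustrative sketch, not part of the original utilities: one way the output of
# make_ranking can be fed to lgb.LGBMRanker, which expects group *sizes*
# (documents per query) rather than per-row group ids. The function name and the
# parameter values below are assumptions chosen for demonstration only.
def _example_fit_ranker_on_synthetic_data():
    X, y, group_ids = make_ranking(n_samples=200, n_features=10, random_state=0)
    # group_ids are contiguous and ascending, so the counts from np.unique are in group order
    _, group_sizes = np.unique(group_ids, return_counts=True)
    ranker = lgb.LGBMRanker(n_estimators=5, min_child_samples=1)
    ranker.fit(X, y, group=group_sizes)
    return ranker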


@lru_cache(maxsize=None)
def make_synthetic_regression(n_samples=100, n_features=4, n_informative=2, random_state=42):
    return sklearn.datasets.make_regression(n_samples=n_samples, n_features=n_features,
                                            n_informative=n_informative, random_state=random_state)


def dummy_obj(preds, train_data):
    return np.ones(preds.shape), np.ones(preds.shape)


def mse_obj(y_pred, dtrain):
    y_true = dtrain.get_label()
    grad = y_pred - y_true
    hess = np.ones(len(grad))
    return grad, hess
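

# Illustrative sketch, assuming LightGBM >= 4.0 (where a callable objective such as
# mse_obj can be passed directly through params); the function name and the
# parameter values below are made up for demonstration only.
def _example_train_with_mse_obj():
    X, y = make_synthetic_regression()
    train_set = lgb.Dataset(X, label=y)
    params = {"objective": mse_obj, "num_leaves": 7, "verbose": -1}
    return lgb.train(params, train_set, num_boost_round=5)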


def softmax(x):
    row_wise_max = np.max(x, axis=1).reshape(-1, 1)
    exp_x = np.exp(x - row_wise_max)
    return exp_x / np.sum(exp_x, axis=1).reshape(-1, 1)


def logistic_sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def sklearn_multiclass_custom_objective(y_true, y_pred, weight=None):
    num_rows, num_class = y_pred.shape
    prob = softmax(y_pred)
    # gradient of the softmax cross-entropy: predicted probability minus the one-hot true label
    grad_update = np.zeros_like(prob)
    grad_update[np.arange(num_rows), y_true.astype(np.int32)] = -1.0
    grad = prob + grad_update
    # diagonal Hessian approximation, rescaled by num_class / (num_class - 1)
    factor = num_class / (num_class - 1)
    hess = factor * prob * (1 - prob)
    if weight is not None:
        weight2d = weight.reshape(-1, 1)
        grad *= weight2d
        hess *= weight2d
    return grad, hess
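

# Illustrative sketch, assuming LightGBM >= 4.0 (where the scikit-learn wrapper hands a custom
# multiclass objective the raw scores as a 2-D array of shape [n_samples, n_classes]); the
# function name and parameter values are made up for demonstration only.
def _example_fit_classifier_with_custom_objective():
    X, y = load_iris(return_X_y=True)
    clf = lgb.LGBMClassifier(objective=sklearn_multiclass_custom_objective, n_estimators=5)
    clf.fit(X, y)
    return clf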


def pickle_obj(obj, filepath, serializer):
    if serializer == 'pickle':
        with open(filepath, 'wb') as f:
            pickle.dump(obj, f)
    elif serializer == 'joblib':
        joblib.dump(obj, filepath)
    elif serializer == 'cloudpickle':
        with open(filepath, 'wb') as f:
            cloudpickle.dump(obj, f)
    else:
        raise ValueError(f'Unrecognized serializer type: {serializer}')


def unpickle_obj(filepath, serializer):
    if serializer == 'pickle':
        with open(filepath, 'rb') as f:
            return pickle.load(f)
    elif serializer == 'joblib':
        return joblib.load(filepath)
    elif serializer == 'cloudpickle':
        with open(filepath, 'rb') as f:
            return cloudpickle.load(f)
    else:
        raise ValueError(f'Unrecognized serializer type: {serializer}')


def pickle_and_unpickle_object(obj, serializer):
    with lgb.basic._TempFile() as tmp_file:
        pickle_obj(
            obj=obj,
            filepath=tmp_file.name,
            serializer=serializer
        )
        obj_from_disk = unpickle_obj(
            filepath=tmp_file.name,
            serializer=serializer
        )
    return obj_from_disk
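

# Illustrative sketch, not part of the original utilities: round-trips a fitted model through
# every serializer in SERIALIZERS and checks that its predictions are unchanged. The function
# name and model settings are made up for demonstration only.
def _example_roundtrip_fitted_model_through_all_serializers():
    X, y = make_synthetic_regression()
    model = lgb.LGBMRegressor(n_estimators=5).fit(X, y)
    for serializer in SERIALIZERS:
        model_from_disk = pickle_and_unpickle_object(model, serializer)
        np.testing.assert_allclose(model.predict(X), model_from_disk.predict(X))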