sklearn.py 33.5 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
# pylint: disable = invalid-name, W0105, C0111, C0301
wxchan's avatar
wxchan committed
3
4
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
5

wxchan's avatar
wxchan committed
6
import numpy as np
7

wxchan's avatar
wxchan committed
8
9
10
11
from .basic import Dataset, LightGBMError
from .compat import (SKLEARN_INSTALLED, LGBMClassifierBase, LGBMDeprecated,
                     LGBMLabelEncoder, LGBMModelBase, LGBMRegressorBase, argc_,
                     range_)
wxchan's avatar
wxchan committed
12
from .engine import train
13

wxchan's avatar
wxchan committed
14

15
def _objective_function_wrapper(func):
    """Decorate an objective function.

    Note: for multi-class task, the y_pred is grouped by class_id first, then by row_id.
          If you want to get the i-th row y_pred in the j-th class, access it as y_pred[j * num_data + i],
          and you should group grad and hess in this way as well.

    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)``:
            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples * n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``

    Raises
    ------
    TypeError
        Raised by ``new_func`` when ``func`` does not take 2 or 3 arguments.
    ValueError
        Raised by ``new_func`` when the dataset has weights and the length of
        grad/hess is not a whole multiple of the number of weights.
    """
    def inner(preds, dataset):
        """Call ``func`` with the arity it declares, then weight grad and hess."""
        labels = dataset.get_label()
        argc = argc_(func)
        if argc == 2:
            grad, hess = func(labels, preds)
        elif argc == 3:
            grad, hess = func(labels, preds, dataset.get_group())
        else:
            raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" % argc)

        # weighted for objective
        weight = dataset.get_weight()
        if weight is None:
            return grad, hess

        num_data = len(weight)
        if len(grad) != num_data:
            # More than one class: grad/hess are laid out class by class, so
            # entry k * num_data + i belongs to row i and takes weight[i].
            num_class = len(grad) // num_data
            if num_class * num_data != len(grad) or len(hess) != len(grad):
                raise ValueError("Length of grad and hess should equal to num_class * num_data")
            # Repeating the weight vector once per class reproduces that layout.
            weight = np.tile(weight, num_class)
        return np.multiply(grad, weight), np.multiply(hess, weight)
    return inner

wxchan's avatar
wxchan committed
73

74
75
76
77
78
79
80
def _eval_function_wrapper(func):
    """Decorate an eval function.

    Note: for multi-class task, the y_pred is grouped by class_id first, then by row_id.
          If you want to get the i-th row y_pred in the j-th class, access it as y_pred[j * num_data + i].

    Parameters
    ----------
    func: callable
        Expects a callable with one of the following signatures:
            ``func(y_true, y_pred)``,
            ``func(y_true, y_pred, weight)``
         or ``func(y_true, y_pred, weight, group)``
            and returning (eval_name->str, eval_result->float, is_bigger_better->Bool):

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        The new eval function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples * n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """Forward to ``func`` with as many dataset fields as it accepts."""
        y_true = dataset.get_label()
        n_args = argc_(func)
        if n_args not in (2, 3, 4):
            raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % n_args)

        # Optional trailing arguments, collected in the order func expects them.
        extras = []
        if n_args >= 3:
            extras.append(dataset.get_weight())
        if n_args == 4:
            extras.append(dataset.get_group())
        return func(y_true, preds, *extras)

    return inner

wxchan's avatar
wxchan committed
122

wxchan's avatar
wxchan committed
123
124
class LGBMModel(LGBMModelBase):
    """Base scikit-learn style estimator wrapping a LightGBM booster."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 subsample_for_bin=50000, objective="regression",
                 min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0, nthread=-1, silent=True,
                 sigmoid=1.0, huber_delta=1.0, gaussian_eta=1.0, fair_c=1.0,
                 poisson_max_delta_step=0.7,
                 max_position=20, label_gain=None,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False, use_missing=True):
        """
        Implementation of the Scikit-Learn API for LightGBM.

        Parameters
        ----------
        boosting_type : string
            gbdt, traditional Gradient Boosting Decision Tree
            dart, Dropouts meet Multiple Additive Regression Trees
        num_leaves : int
            Maximum tree leaves for base learners.
        max_depth : int
            Maximum tree depth for base learners, -1 means no limit.
        learning_rate : float
            Boosting learning rate
        n_estimators : int
            Number of boosted trees to fit.
        max_bin : int
            Number of bucketed bins for feature values
        subsample_for_bin : int
            Number of samples for constructing bins.
        objective : string or callable
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            default: binary for LGBMClassifier, lambdarank for LGBMRanker
        min_split_gain : float
            Minimum loss reduction required to make a further partition on a leaf node of the tree.
        min_child_weight : int
            Minimum sum of instance weight(hessian) needed in a child(leaf)
        min_child_samples : int
            Minimum number of data needed in a child(leaf)
        subsample : float
            Subsample ratio of the training instance.
        subsample_freq : int
            Frequency of subsample, <=0 means not enabled
        colsample_bytree : float
            Subsample ratio of columns when constructing each tree.
        reg_alpha : float
            L1 regularization term on weights
        reg_lambda : float
            L2 regularization term on weights
        scale_pos_weight : float
            Balancing of positive and negative weights.
        is_unbalance : bool
            Is unbalance for binary classification
        seed : int
            Random number seed.
        nthread : int
            Number of parallel threads
        silent : boolean
            Whether to print messages while running boosting.
        sigmoid : float
            Only used in binary classification and lambdarank. Parameter for sigmoid function.
        huber_delta : float
            Only used in regression. Parameter for Huber loss function.
        gaussian_eta : float
            Only used in regression. Parameter for L1 and Huber loss function.
            It is used to control the width of Gaussian function to approximate hessian.
        fair_c : float
            Only used in regression. Parameter for Fair loss function.
        poisson_max_delta_step : float
            Parameter used to safeguard optimization in Poisson regression.
        max_position : int
            Only used in lambdarank, will optimize NDCG at this position.
        label_gain : list of float
            Only used in lambdarank, relevant gain for labels.
            For example, the gain of label 2 is 3 if using default label gains.
            None (default) means use default value of CLI version: {0,1,3,7,15,31,63,...}.
        drop_rate : float
            Only used when boosting_type='dart'. Probability to select dropping trees.
        skip_drop : float
            Only used when boosting_type='dart'. Probability to skip dropping trees.
        max_drop : int
            Only used when boosting_type='dart'. Max number of dropped trees in one iteration.
        uniform_drop : bool
            Only used when boosting_type='dart'. If true, drop trees uniformly, else drop according to weights.
        xgboost_dart_mode : bool
            Only used when boosting_type='dart'. Whether use xgboost dart mode.
        use_missing : bool
            Set to False will disable the special handling of missing values (default: True).

        Raises
        ------
        LightGBMError
            If scikit-learn is not installed.

        Note
        ----
        A custom objective function can be provided for the ``objective``
        parameter. In this case, it should have the signature
        ``objective(y_true, y_pred) -> grad, hess``
            or ``objective(y_true, y_pred, group) -> grad, hess``:

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class]
                The predicted values
            group: array_like
                group/query data, used for ranking task
            grad: array_like of shape [n_samples] or shape[n_samples * n_class]
                The value of the gradient for each sample point.
            hess: array_like of shape [n_samples] or shape[n_samples * n_class]
                The value of the second derivative for each sample point

        for multi-class task, the y_pred is grouped by class_id first, then by row_id
            if you want to get the i-th row y_pred in the j-th class, access it as y_pred[j*num_data+i]
            and you should group grad and hess in this way as well
        """
        if not SKLEARN_INSTALLED:
            raise LightGBMError('Scikit-learn is required for this module')

        self.boosting_type = boosting_type
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.subsample_for_bin = subsample_for_bin
        self.objective = objective
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        self.nthread = nthread
        self.silent = silent
        self.sigmoid = sigmoid
        self.huber_delta = huber_delta
        self.gaussian_eta = gaussian_eta
        self.fair_c = fair_c
        self.poisson_max_delta_step = poisson_max_delta_step
        self.max_position = max_position
        self.label_gain = label_gain
        self.drop_rate = drop_rate
        self.skip_drop = skip_drop
        self.max_drop = max_drop
        self.uniform_drop = uniform_drop
        self.xgboost_dart_mode = xgboost_dart_mode
        self.use_missing = use_missing
        # Fitted state, filled in by fit().
        self._Booster = None
        self.evals_result = None
        self.best_iteration = -1
        self.best_score = {}
        # A callable objective is wrapped once here into the (preds, dataset)
        # form that train() expects; a string objective needs no wrapper.
        if callable(self.objective):
            self.fobj = _objective_function_wrapper(self.objective)
        else:
            self.fobj = None

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None):
        """
        Fit the gradient boosting model

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        sample_weight : array_like
            weight of training data
        init_score : array_like
            init score of training data
        group : array_like
            group data of training data
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping.
            A single (X, y) tuple is also accepted.
        eval_names: list of string
            Names of eval_set
        eval_sample_weight : List of array
            weight of eval data (a dict keyed by eval_set index is also accepted)
        eval_init_score : List of array
            init score of eval data (a dict keyed by eval_set index is also accepted)
        eval_group : List of array
            group data of eval data (a dict keyed by eval_set index is also accepted)
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
            If callable, a custom evaluation metric, see note for more details.
        early_stopping_rounds : int
            Forwarded to ``train`` as ``early_stopping_rounds``. When not None,
            ``best_iteration`` is updated from the trained booster.
        verbose : bool
            If `verbose` and an evaluation set is used, evaluation results are
            reported during training (forwarded to ``train`` as ``verbose_eval``).
        feature_name : list of str, or 'auto'
            Feature names
            If 'auto' and data is pandas DataFrame, use data columns name
        categorical_feature : list of str or int, or 'auto'
            Categorical features,
            type int represents index,
            type str represents feature names (need to specify feature_name as well)
            If 'auto' and data is pandas DataFrame, use pandas categorical columns
        callbacks : list of callback functions
            List of callback functions that are applied at each iteration.
            See Callbacks in Python-API.md for more information.

        Returns
        -------
        self : object
            The fitted estimator.

        Raises
        ------
        TypeError
            If eval_sample_weight, eval_init_score or eval_group is neither
            None, a list nor a dict.

        Note
        ----
        Custom eval function expects a callable with following functions:
            ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
                or ``func(y_true, y_pred, weight, group)``.
            return (eval_name, eval_result, is_bigger_better)
                or list of (eval_name, eval_result, is_bigger_better)

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task
            eval_name: str
                name of evaluation
            eval_result: float
                eval result
            is_bigger_better: bool
                is eval result bigger better, e.g. AUC is bigger_better.
        for multi-class task, the y_pred is grouped by class_id first, then by row_id
          if you want to get the i-th row y_pred in the j-th class, access it as y_pred[j*num_data+i]
        """
        evals_result = {}
        params = self.get_params()
        params['verbose'] = -1 if self.silent else 1
        if hasattr(self, 'n_classes_') and self.n_classes_ > 2:
            params['num_class'] = self.n_classes_
        if hasattr(self, 'eval_at'):
            params['ndcg_eval_at'] = self.eval_at
        if self.fobj:
            params['objective'] = 'None'  # objective = nullptr for unknown objective
        if 'label_gain' in params and params['label_gain'] is None:
            del params['label_gain']  # use default of cli version

        if callable(eval_metric):
            feval = _eval_function_wrapper(eval_metric)
        else:
            feval = None
            params['metric'] = eval_metric

        def _construct_dataset(X, y, sample_weight, init_score, group, params):
            """Build a Dataset from one (X, y) pair and its optional meta data."""
            ret = Dataset(X, label=y, max_bin=self.max_bin, weight=sample_weight, group=group, params=params)
            ret.set_init_score(init_score)
            return ret

        def _get_meta_data(collection, i):
            """Return the i-th entry of a per-eval-set list/dict, or None if absent."""
            if collection is None:
                return None
            elif isinstance(collection, list):
                return collection[i] if len(collection) > i else None
            elif isinstance(collection, dict):
                return collection.get(i, None)
            else:
                raise TypeError('eval_sample_weight, eval_init_score, and eval_group should be dict or list')

        train_set = _construct_dataset(X, y, sample_weight, init_score, group, params)

        valid_sets = []
        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, valid_data in enumerate(eval_set):
                # reduce cost for prediction on training data: when the very same
                # X and y objects are passed for evaluation, reuse the training Dataset
                if valid_data[0] is X and valid_data[1] is y:
                    valid_set = train_set
                else:
                    valid_weight = _get_meta_data(eval_sample_weight, i)
                    valid_init_score = _get_meta_data(eval_init_score, i)
                    valid_group = _get_meta_data(eval_group, i)
                    valid_set = _construct_dataset(valid_data[0], valid_data[1],
                                                   valid_weight, valid_init_score, valid_group, params)
                valid_sets.append(valid_set)

        self._Booster = train(params, train_set,
                              self.n_estimators, valid_sets=valid_sets, valid_names=eval_names,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=self.fobj, feval=feval,
                              verbose_eval=verbose, feature_name=feature_name,
                              categorical_feature=categorical_feature,
                              callbacks=callbacks)

        if evals_result:
            self.evals_result = evals_result

        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        self.best_score = self._Booster.best_score

        # free dataset
        self.booster_.free_dataset()
        del train_set, valid_sets
        return self

    def predict(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted value for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Forwarded unchanged to the booster's ``predict`` as ``raw_score``.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
        """
        return self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """
        Return the predicted leaf of every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster_.predict(X, pred_leaf=True, num_iteration=num_iteration)

    @property
    def booster_(self):
        """Get the underlying lightgbm Booster of this model."""
        if self._Booster is None:
            raise LightGBMError('No booster found. Need to call fit beforehand.')
        return self._Booster

    @property
    def evals_result_(self):
        """Get the evaluation results."""
        if self.evals_result is None:
            raise LightGBMError('No results found. Need to call fit with eval set beforehand.')
        return self.evals_result

    @property
    def feature_importances_(self):
        """Get normalized feature importances.

        The booster's importances are divided by their sum. When that sum is
        zero the unnormalized (all-zero) array is returned instead, so the
        result never contains NaN from a 0 / 0 division.
        """
        importance_array = self.booster_.feature_importance().astype(np.float32)
        total = importance_array.sum()
        if total == 0:
            return importance_array
        return importance_array / total

    @LGBMDeprecated('Use attribute booster_ instead.')
    def booster(self):
        """Deprecated accessor for ``booster_``."""
        return self.booster_

    @LGBMDeprecated('Use attribute feature_importances_ instead.')
    def feature_importance(self):
        """Deprecated accessor for ``feature_importances_``."""
        return self.feature_importances_
wxchan's avatar
wxchan committed
493

wxchan's avatar
wxchan committed
494

wxchan's avatar
wxchan committed
495
496
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    """LGBMModel specialised for regression: regression-only hyper-parameters, ``l2`` default metric."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 subsample_for_bin=50000, objective="regression",
                 min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0,
                 seed=0, nthread=-1, silent=True,
                 huber_delta=1.0, gaussian_eta=1.0, fair_c=1.0,
                 poisson_max_delta_step=0.7,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False, use_missing=True):
        """Forward every argument by keyword to ``LGBMModel.__init__``.

        Arguments of the base constructor that are not listed here keep the
        base class defaults.
        """
        base_kwargs = dict(
            boosting_type=boosting_type,
            num_leaves=num_leaves,
            max_depth=max_depth,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_bin=max_bin,
            subsample_for_bin=subsample_for_bin,
            objective=objective,
            min_split_gain=min_split_gain,
            min_child_weight=min_child_weight,
            min_child_samples=min_child_samples,
            subsample=subsample,
            subsample_freq=subsample_freq,
            colsample_bytree=colsample_bytree,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            seed=seed,
            nthread=nthread,
            silent=silent,
            huber_delta=huber_delta,
            gaussian_eta=gaussian_eta,
            fair_c=fair_c,
            poisson_max_delta_step=poisson_max_delta_step,
            drop_rate=drop_rate,
            skip_drop=skip_drop,
            max_drop=max_drop,
            uniform_drop=uniform_drop,
            xgboost_dart_mode=xgboost_dart_mode,
            use_missing=use_missing,
        )
        super(LGBMRegressor, self).__init__(**base_kwargs)

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="l2",
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto', callbacks=None):
        """Fit the regressor by delegating to ``LGBMModel.fit``.

        No ``group`` or ``eval_group`` is forwarded, and ``eval_metric``
        defaults to ``"l2"``. Returns ``self``.
        """
        fit_kwargs = dict(
            sample_weight=sample_weight,
            init_score=init_score,
            eval_set=eval_set,
            eval_names=eval_names,
            eval_sample_weight=eval_sample_weight,
            eval_init_score=eval_init_score,
            eval_metric=eval_metric,
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            callbacks=callbacks,
        )
        super(LGBMRegressor, self).fit(X, y, **fit_kwargs)
        return self

wxchan's avatar
wxchan committed
543

wxchan's avatar
wxchan committed
544
545
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
    """LightGBM classifier with a scikit-learn style interface.

    Labels passed to ``fit`` are encoded with ``LGBMLabelEncoder`` before
    training and decoded again by ``predict``. When more than two classes are
    seen, ``fit`` switches ``self.objective`` to ``"multiclass"``.
    """

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 subsample_for_bin=50000, objective="binary",
                 min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0, nthread=-1,
                 silent=True, sigmoid=1.0,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False, use_missing=True):
        """Construct the classifier; every argument is forwarded to ``LGBMModel.__init__``."""
        # Populated by fit(); None means "not fitted yet" for classes_ / n_classes_.
        self.classes, self.n_classes = None, None
        super(LGBMClassifier, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
                                             max_depth=max_depth, learning_rate=learning_rate,
                                             n_estimators=n_estimators, max_bin=max_bin,
                                             subsample_for_bin=subsample_for_bin, objective=objective,
                                             min_split_gain=min_split_gain, min_child_weight=min_child_weight,
                                             min_child_samples=min_child_samples, subsample=subsample,
                                             subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
                                             reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                                             scale_pos_weight=scale_pos_weight, is_unbalance=is_unbalance,
                                             seed=seed, nthread=nthread, silent=silent, sigmoid=sigmoid,
                                             drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
                                             uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode,
                                             use_missing=use_missing)

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="logloss",
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None):
        """
        Fit the classifier.

        The labels ``y`` (and the labels of every evaluation set) are encoded
        with a freshly fitted ``LGBMLabelEncoder`` and the remaining arguments
        are forwarded to ``LGBMModel.fit``.

        Parameters
        ----------
        eval_set : tuple or list of tuples, optional
            ``(X, y)`` pairs used for evaluation. A single tuple is treated as
            a one-element list. The object passed in is not modified.
        eval_metric : str or other
            ``"logloss"`` / ``"error"`` and their ``binary_`` / ``multi_``
            spellings are rewritten to the variant matching the number of
            classes found in ``y``; any other value is passed through as is.

        Returns
        -------
        self : object
        """
        self._le = LGBMLabelEncoder().fit(y)
        _y = self._le.transform(y)

        self.classes = self._le.classes_
        self.n_classes = len(self.classes_)
        if self.n_classes > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            self.objective = "multiclass"
            if eval_metric in ('logloss', 'binary_logloss'):
                eval_metric = "multi_logloss"
            elif eval_metric in ('error', 'binary_error'):
                eval_metric = "multi_error"
        else:
            if eval_metric in ('logloss', 'multi_logloss'):
                eval_metric = 'binary_logloss'
            elif eval_metric in ('error', 'multi_error'):
                eval_metric = 'binary_error'

        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            # Collect the encoded pairs in a new list: writing them back into
            # the caller's list would leave it holding encoded labels after fit.
            encoded_eval_set = []
            for valid_x, valid_y in eval_set:
                if valid_x is X and valid_y is y:
                    # Same objects as the training data: reuse the encoded labels.
                    encoded_eval_set.append((valid_x, _y))
                else:
                    encoded_eval_set.append((valid_x, self._le.transform(valid_y)))
            eval_set = encoded_eval_set

        super(LGBMClassifier, self).fit(X, _y, sample_weight=sample_weight,
                                        init_score=init_score, eval_set=eval_set,
                                        eval_names=eval_names,
                                        eval_sample_weight=eval_sample_weight,
                                        eval_init_score=eval_init_score,
                                        eval_metric=eval_metric,
                                        early_stopping_rounds=early_stopping_rounds,
                                        verbose=verbose, feature_name=feature_name,
                                        categorical_feature=categorical_feature,
                                        callbacks=callbacks)
        return self

    def predict(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted class label for each sample.

        The label is the class with the largest value in the output of
        ``predict_proba``, mapped back through the label encoder fitted in
        ``fit``.
        """
        class_probs = self.predict_proba(X, raw_score, num_iteration)
        class_index = np.argmax(class_probs, axis=1)
        return self._le.inverse_transform(class_index)

    def predict_proba(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted probability for each class for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Forwarded to the booster's ``predict``.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_probability : array_like, shape=[n_samples, n_classes]
        """
        class_probs = self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)
        if self.n_classes > 2:
            return class_probs
        # Two classes: the booster output is used as the second column and
        # its complement as the first.
        # NOTE(review): with raw_score=True the values are presumably raw
        # margins, for which ``1 - value`` is not a complement - confirm.
        return np.vstack((1. - class_probs, class_probs)).transpose()

    @property
    def classes_(self):
        """Get class label array."""
        if self.classes is None:
            raise LightGBMError('No classes found. Need to call fit beforehand.')
        return self.classes

    @property
    def n_classes_(self):
        """Get number of classes"""
        if self.n_classes is None:
            raise LightGBMError('No classes found. Need to call fit beforehand.')
        return self.n_classes
wxchan's avatar
wxchan committed
658

wxchan's avatar
wxchan committed
659

wxchan's avatar
wxchan committed
660
661
class LGBMRanker(LGBMModel):
    """LightGBM ranker with a scikit-learn style interface.

    Uses the ``"lambdarank"`` objective by default and requires group
    (query) information when fitting.
    """

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 subsample_for_bin=50000, objective="lambdarank",
                 min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0, nthread=-1, silent=True,
                 sigmoid=1.0, max_position=20, label_gain=None,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False, use_missing=True):
        """Construct the ranker; every argument is forwarded to ``LGBMModel.__init__``."""
        super(LGBMRanker, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
                                         max_depth=max_depth, learning_rate=learning_rate,
                                         n_estimators=n_estimators, max_bin=max_bin,
                                         subsample_for_bin=subsample_for_bin, objective=objective,
                                         min_split_gain=min_split_gain, min_child_weight=min_child_weight,
                                         min_child_samples=min_child_samples, subsample=subsample,
                                         subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
                                         reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                                         scale_pos_weight=scale_pos_weight, is_unbalance=is_unbalance,
                                         seed=seed, nthread=nthread, silent=silent,
                                         sigmoid=sigmoid, max_position=max_position, label_gain=label_gain,
                                         drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
                                         uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode,
                                         use_missing=use_missing)

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric='ndcg', eval_at=1,
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None):
        """
        Most arguments like common methods except following:

        eval_at : list of int
            The evaluation positions of NDCG. Stored as ``self.eval_at``
            when it is not None.

        Raises
        ------
        ValueError
            If ``group`` is None, or if ``eval_set`` is given and
            ``eval_group`` is None, has a different length than ``eval_set``,
            or (as a dict keyed ``0..len-1`` or a list) has a missing entry.

        Returns
        -------
        self : object
        """
        # check group data
        if group is None:
            raise ValueError("Should set group for ranking task")

        if eval_set is not None:
            if eval_group is None:
                raise ValueError("Eval_group cannot be None when eval_set is not None")
            # NOTE(review): len(eval_set) is taken as given; a single (X, y)
            # tuple therefore counts as length 2 here - confirm this is intended.
            if len(eval_group) != len(eval_set):
                raise ValueError("Length of eval_group should equal to eval_set")
            if isinstance(eval_group, dict):
                # A dict must provide a non-None entry for every index 0..len-1.
                group_missing = any(idx not in eval_group or eval_group[idx] is None
                                    for idx in range_(len(eval_group)))
            elif isinstance(eval_group, list):
                group_missing = any(one_group is None for one_group in eval_group)
            else:
                # Other container types are not inspected element by element.
                group_missing = False
            if group_missing:
                raise ValueError("Should set group for all eval dataset for ranking task; if you use dict, the index should start from 0")

        if eval_at is not None:
            self.eval_at = eval_at
        super(LGBMRanker, self).fit(X, y, sample_weight=sample_weight,
                                    init_score=init_score, group=group,
                                    eval_set=eval_set, eval_names=eval_names,
                                    eval_sample_weight=eval_sample_weight,
                                    eval_init_score=eval_init_score, eval_group=eval_group,
                                    eval_metric=eval_metric,
                                    early_stopping_rounds=early_stopping_rounds,
                                    verbose=verbose, feature_name=feature_name,
                                    categorical_feature=categorical_feature,
                                    callbacks=callbacks)
        return self