basic.py 109 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
"""Wrapper for C API of LightGBM."""
wxchan's avatar
wxchan committed
3
4
from __future__ import absolute_import

5
import copy
wxchan's avatar
wxchan committed
6
import ctypes
7
import os
wxchan's avatar
wxchan committed
8
import warnings
wxchan's avatar
wxchan committed
9
from tempfile import NamedTemporaryFile
wxchan's avatar
wxchan committed
10
11
12
13

import numpy as np
import scipy.sparse

14
from .compat import (PANDAS_INSTALLED, DataFrame, Series, is_dtype_sparse,
15
                     DataTable,
16
17
                     decode_string, string_type,
                     integer_types, numeric_types,
18
                     json, json_default_with_numpy,
19
                     range_, zip_)
wxchan's avatar
wxchan committed
20
21
from .libpath import find_lib_path

wxchan's avatar
wxchan committed
22

wxchan's avatar
wxchan committed
23
def _load_lib():
    """Load LightGBM library.

    Returns
    -------
    lib : ctypes.CDLL or None
        The library loaded from the first path returned by ``find_lib_path()``,
        or None when no path was found.
    """
    lib_path = find_lib_path()
    if not lib_path:
        return None
    lib = ctypes.cdll.LoadLibrary(lib_path[0])
    # LGBM_GetLastError returns a C string rather than the default int.
    lib.LGBM_GetLastError.restype = ctypes.c_char_p
    return lib

wxchan's avatar
wxchan committed
32

wxchan's avatar
wxchan committed
33
34
_LIB = _load_lib()

wxchan's avatar
wxchan committed
35

wxchan's avatar
wxchan committed
36
def _safe_call(ret):
37
38
    """Check the return value from C API call.

wxchan's avatar
wxchan committed
39
40
41
    Parameters
    ----------
    ret : int
42
        The return value from C API calls.
wxchan's avatar
wxchan committed
43
44
    """
    if ret != 0:
45
        raise LightGBMError(decode_string(_LIB.LGBM_GetLastError()))
wxchan's avatar
wxchan committed
46

wxchan's avatar
wxchan committed
47

wxchan's avatar
wxchan committed
48
def is_numeric(obj):
    """Check whether object is a number or not, include numpy number, etc.

    Anything accepted by ``float()`` counts as numeric, which includes
    strings that spell a number.
    """
    try:
        float(obj)
    except (TypeError, ValueError):
        # TypeError: obj is not a string or a number
        # ValueError: invalid literal
        return False
    return True

wxchan's avatar
wxchan committed
58

wxchan's avatar
wxchan committed
59
def is_numpy_1d_array(data):
    """Check whether data is a numpy 1-D array."""
    return isinstance(data, np.ndarray) and data.ndim == 1
wxchan's avatar
wxchan committed
62

wxchan's avatar
wxchan committed
63

wxchan's avatar
wxchan committed
64
def is_1d_list(data):
    """Check whether data is a 1-D list.

    An empty list qualifies; otherwise only the first element is inspected
    and it must be numeric.
    """
    return isinstance(data, list) and (not data or is_numeric(data[0]))
wxchan's avatar
wxchan committed
67

wxchan's avatar
wxchan committed
68

69
def list_to_1d_numpy(data, dtype=np.float32, name='list'):
    """Convert data to numpy 1-D array.

    Parameters
    ----------
    data : list, numpy 1-D array or pandas Series
        Values to convert.
    dtype : numpy dtype, optional (default=np.float32)
        Element type of the returned array.
    name : string, optional (default='list')
        Name used in the error message when ``data`` has an unsupported type.

    Returns
    -------
    result : numpy array
        ``data`` itself when it is already a 1-D array of ``dtype``,
        otherwise a converted array.

    Raises
    ------
    ValueError
        If ``data`` is a Series whose dtype is not int, float or bool.
    TypeError
        If ``data`` is none of the supported types.
    """
    if is_numpy_1d_array(data):
        if data.dtype == dtype:
            return data
        return data.astype(dtype=dtype, copy=False)
    elif is_1d_list(data):
        # np.asarray copies only when required. np.array(..., copy=False)
        # raises ValueError on NumPy >= 2.0 whenever a copy is unavoidable,
        # which is always the case for a Python list.
        return np.asarray(data, dtype=dtype)
    elif isinstance(data, Series):
        if _get_bad_pandas_dtypes([data.dtypes]):
            raise ValueError('Series.dtypes must be int, float or bool')
        return np.asarray(data, dtype=dtype)  # SparseArray should be supported as well
    else:
        raise TypeError("Wrong type({0}) for {1}.\n"
                        "It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
wxchan's avatar
wxchan committed
85

wxchan's avatar
wxchan committed
86

wxchan's avatar
wxchan committed
87
def cfloat32_array_to_numpy(cptr, length):
    """Convert a ctypes float pointer array to a numpy array.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_float)
        Pointer to the first element.
    length : int
        Number of elements to read from the pointer.

    Returns
    -------
    result : numpy array
        New float32 array holding ``length`` values.

    Raises
    ------
    RuntimeError
        If ``cptr`` is not a float pointer.
    """
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
        raise RuntimeError('Expected float pointer')
    # A ctypes pointer has no length, so ``count`` bounds the iteration.
    return np.fromiter(cptr, dtype=np.float32, count=length)
wxchan's avatar
wxchan committed
93

Guolin Ke's avatar
Guolin Ke committed
94

Guolin Ke's avatar
Guolin Ke committed
95
def cfloat64_array_to_numpy(cptr, length):
    """Convert a ctypes double pointer array to a numpy array.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_double)
        Pointer to the first element.
    length : int
        Number of elements to read from the pointer.

    Returns
    -------
    result : numpy array
        New float64 array holding ``length`` values.

    Raises
    ------
    RuntimeError
        If ``cptr`` is not a double pointer.
    """
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_double)):
        raise RuntimeError('Expected double pointer')
    # A ctypes pointer has no length, so ``count`` bounds the iteration.
    return np.fromiter(cptr, dtype=np.float64, count=length)

wxchan's avatar
wxchan committed
102

wxchan's avatar
wxchan committed
103
def cint32_array_to_numpy(cptr, length):
    """Convert a ctypes int32 pointer array to a numpy array.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_int32)
        Pointer to the first element.
    length : int
        Number of elements to read from the pointer.

    Returns
    -------
    result : numpy array
        New int32 array holding ``length`` values.

    Raises
    ------
    RuntimeError
        If ``cptr`` is not an int32 pointer.
    """
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
        raise RuntimeError('Expected int pointer')
    # A ctypes pointer has no length, so ``count`` bounds the iteration.
    return np.fromiter(cptr, dtype=np.int32, count=length)
wxchan's avatar
wxchan committed
109

wxchan's avatar
wxchan committed
110

111
112
113
114
115
116
117
118
def cint8_array_to_numpy(cptr, length):
    """Convert a ctypes int8 pointer array to a numpy array.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_int8)
        Pointer to the first element.
    length : int
        Number of elements to read from the pointer.

    Returns
    -------
    result : numpy array
        New int8 array holding ``length`` values.

    Raises
    ------
    RuntimeError
        If ``cptr`` is not an int8 pointer.
    """
    expected_type = ctypes.POINTER(ctypes.c_int8)
    if not isinstance(cptr, expected_type):
        raise RuntimeError('Expected int pointer')
    # A ctypes pointer has no length, so ``count`` bounds the iteration.
    return np.fromiter(cptr, dtype=np.int8, count=length)


wxchan's avatar
wxchan committed
119
def c_str(string):
    """Convert a Python string to C string.

    The string is encoded as UTF-8 and wrapped in a ``ctypes.c_char_p``.
    """
    return ctypes.c_char_p(string.encode('utf-8'))

wxchan's avatar
wxchan committed
123

wxchan's avatar
wxchan committed
124
def c_array(ctype, values):
    """Convert a Python array to C array.

    Parameters
    ----------
    ctype : ctypes type
        Element type of the resulting array.
    values : sequence
        Values copied into the array; its length fixes the array size.

    Returns
    -------
    result : ctypes array
        Array of ``len(values)`` elements of ``ctype``.
    """
    array_type = ctype * len(values)
    return array_type(*values)

wxchan's avatar
wxchan committed
128

wxchan's avatar
wxchan committed
129
def param_dict_to_str(data):
    """Convert Python dictionary to string, which is passed to C API.

    Parameters
    ----------
    data : dict or None
        Parameters to serialize.

    Returns
    -------
    result : string
        Space-separated ``key=value`` pairs; sequence values are joined with
        commas. Entries whose value is None are skipped. An empty string is
        returned for None or an empty dictionary.

    Raises
    ------
    TypeError
        If a value is neither a sequence, a string, a number nor None.
    """
    if not data:
        return ""
    pairs = []
    for key, val in data.items():
        if isinstance(val, (list, tuple, set)) or is_numpy_1d_array(val):
            pairs.append(str(key) + '=' + ','.join(map(str, val)))
        elif isinstance(val, string_type) or isinstance(val, numeric_types) or is_numeric(val):
            pairs.append(str(key) + '=' + str(val))
        elif val is not None:
            raise TypeError('Unknown type of parameter:%s, got:%s'
                            % (key, type(val).__name__))
    return ' '.join(pairs)
143

wxchan's avatar
wxchan committed
144

145
class _TempFile(object):
146
147
148
149
    def __enter__(self):
        with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
            self.name = f.name
        return self
wxchan's avatar
wxchan committed
150

151
152
153
    def __exit__(self, exc_type, exc_val, exc_tb):
        if os.path.isfile(self.name):
            os.remove(self.name)
wxchan's avatar
wxchan committed
154

155
156
157
158
    def readlines(self):
        with open(self.name, "r+") as f:
            ret = f.readlines()
        return ret
wxchan's avatar
wxchan committed
159

160
161
    def writelines(self, lines):
        with open(self.name, "w+") as f:
162
            f.writelines(lines)
163

wxchan's avatar
wxchan committed
164

165
class LightGBMError(Exception):
    """Error thrown by LightGBM."""


171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
class _ConfigAliases(object):
    aliases = {"boosting": {"boosting",
                            "boosting_type",
                            "boost"},
               "categorical_feature": {"categorical_feature",
                                       "cat_feature",
                                       "categorical_column",
                                       "cat_column"},
               "early_stopping_round": {"early_stopping_round",
                                        "early_stopping_rounds",
                                        "early_stopping",
                                        "n_iter_no_change"},
               "eval_at": {"eval_at",
                           "ndcg_eval_at",
                           "ndcg_at",
                           "map_eval_at",
                           "map_at"},
               "header": {"header",
                          "has_header"},
               "machines": {"machines",
                            "workers",
                            "nodes"},
               "metric": {"metric",
                          "metrics",
                          "metric_types"},
               "num_class": {"num_class",
                             "num_classes"},
               "num_iterations": {"num_iterations",
                                  "num_iteration",
                                  "n_iter",
                                  "num_tree",
                                  "num_trees",
                                  "num_round",
                                  "num_rounds",
                                  "num_boost_round",
                                  "n_estimators"},
               "objective": {"objective",
                             "objective_type",
                             "app",
                             "application"},
               "verbosity": {"verbosity",
                             "verbose"}}

    @classmethod
    def get(cls, *args):
        ret = set()
        for i in args:
            ret |= cls.aliases.get(i, set())
        return ret


222
223
MAX_INT32 = (1 << 31) - 1

# Macro definition of data type in C API of LightGBM
C_API_DTYPE_FLOAT32 = 0
C_API_DTYPE_FLOAT64 = 1
C_API_DTYPE_INT32 = 2
C_API_DTYPE_INT64 = 3
C_API_DTYPE_INT8 = 4

# Matrix is row major in Python
C_API_IS_ROW_MAJOR = 1

# Macro definition of prediction type in C API of LightGBM
C_API_PREDICT_NORMAL = 0
C_API_PREDICT_RAW_SCORE = 1
C_API_PREDICT_LEAF_INDEX = 2
C_API_PREDICT_CONTRIB = 3

# Data type of data field
FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
                     "weight": C_API_DTYPE_FLOAT32,
                     "init_score": C_API_DTYPE_FLOAT64,
                     "group": C_API_DTYPE_INT32,
                     "feature_penalty": C_API_DTYPE_FLOAT64,
                     "monotone_constraints": C_API_DTYPE_INT8}
wxchan's avatar
wxchan committed
247

wxchan's avatar
wxchan committed
248

249
def convert_from_sliced_object(data):
    """Fix the memory of multi-dimensional sliced object.

    Parameters
    ----------
    data : object
        Candidate array. Anything that is not a numpy array viewing another
        numpy array is returned unchanged.

    Returns
    -------
    result : object
        A copy made with ``np.copy`` (after emitting a warning) when ``data``
        is a view that is not C-contiguous, otherwise ``data`` itself.
    """
    is_view = isinstance(data, np.ndarray) and isinstance(data.base, np.ndarray)
    if is_view and not data.flags.c_contiguous:
        warnings.warn("Usage of np.ndarray subset (sliced data) is not recommended "
                      "due to it will double the peak memory cost in LightGBM.")
        return np.copy(data)
    return data


wxchan's avatar
wxchan committed
259
def c_float_array(data):
    """Get pointer of float numpy array / list.

    Parameters
    ----------
    data : list or numpy 1-D array
        Values of dtype float32 or float64.

    Returns
    -------
    result : tuple
        ``(ptr_data, type_data, data)``: the ctypes pointer, the matching
        ``C_API_DTYPE_*`` constant and the array that owns the memory.

    Raises
    ------
    TypeError
        If ``data`` is not a 1-D list/array or its dtype is not float32/float64.
    """
    if is_1d_list(data):
        # np.array(..., copy=False) raises on NumPy >= 2.0 for list input,
        # because a list can never be converted without copying.
        data = np.asarray(data)
    if is_numpy_1d_array(data):
        data = convert_from_sliced_object(data)
        assert data.flags.c_contiguous
        if data.dtype == np.float32:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
            type_data = C_API_DTYPE_FLOAT32
        elif data.dtype == np.float64:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
            type_data = C_API_DTYPE_FLOAT64
        else:
            raise TypeError("Expected np.float32 or np.float64, met type({})"
                            .format(data.dtype))
    else:
        raise TypeError("Unknown type({})".format(type(data).__name__))
    return (ptr_data, type_data, data)  # return `data` to avoid the temporary copy is freed
wxchan's avatar
wxchan committed
278

wxchan's avatar
wxchan committed
279

wxchan's avatar
wxchan committed
280
def c_int_array(data):
    """Get pointer of int numpy array / list.

    Parameters
    ----------
    data : list or numpy 1-D array
        Values of dtype int32 or int64.

    Returns
    -------
    result : tuple
        ``(ptr_data, type_data, data)``: the ctypes pointer, the matching
        ``C_API_DTYPE_*`` constant and the array that owns the memory.

    Raises
    ------
    TypeError
        If ``data`` is not a 1-D list/array or its dtype is not int32/int64.
    """
    if is_1d_list(data):
        # np.array(..., copy=False) raises on NumPy >= 2.0 for list input,
        # because a list can never be converted without copying.
        data = np.asarray(data)
    if is_numpy_1d_array(data):
        data = convert_from_sliced_object(data)
        assert data.flags.c_contiguous
        if data.dtype == np.int32:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
            type_data = C_API_DTYPE_INT32
        elif data.dtype == np.int64:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
            type_data = C_API_DTYPE_INT64
        else:
            raise TypeError("Expected np.int32 or np.int64, met type({})"
                            .format(data.dtype))
    else:
        raise TypeError("Unknown type({})".format(type(data).__name__))
    return (ptr_data, type_data, data)  # return `data` to avoid the temporary copy is freed
wxchan's avatar
wxchan committed
299

wxchan's avatar
wxchan committed
300

301
302
303
304
305
306
307
308
309
310
311
def _get_bad_pandas_dtypes(dtypes):
    pandas_dtype_mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int',
                           'int64': 'int', 'uint8': 'int', 'uint16': 'int',
                           'uint32': 'int', 'uint64': 'int', 'bool': 'int',
                           'float16': 'float', 'float32': 'float', 'float64': 'float'}
    bad_indices = [i for i, dtype in enumerate(dtypes) if (dtype.name not in pandas_dtype_mapper
                                                           and (not is_dtype_sparse(dtype)
                                                                or dtype.subtype.name not in pandas_dtype_mapper))]
    return bad_indices


312
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
    """Turn a pandas DataFrame into a float numpy matrix plus metadata.

    Parameters
    ----------
    data : object
        Input data. Only a DataFrame is converted; any other object is
        returned as is.
    feature_name : list of strings, 'auto' or None
        Feature names. For a DataFrame, 'auto' is replaced by the column
        names; for other data, 'auto' is replaced by None.
    categorical_feature : list, 'auto' or None
        Categorical features. For a DataFrame, 'auto' is replaced by the
        unordered category columns; for other data, 'auto' is replaced by None.
    pandas_categorical : list of lists or None
        Category values per category column. None means a train dataset, in
        which case they are collected from ``data``.

    Returns
    -------
    result : tuple
        ``(data, feature_name, categorical_feature, pandas_categorical)``.

    Raises
    ------
    ValueError
        If the DataFrame is empty or not 2 dimensional, the number of category
        columns does not match ``pandas_categorical``, or a column has an
        unsupported dtype.
    """
    if isinstance(data, DataFrame):
        if len(data.shape) != 2 or data.shape[0] < 1:
            raise ValueError('Input data must be 2 dimensional and non empty.')
        if feature_name == 'auto' or feature_name is None:
            data = data.rename(columns=str)
        cat_cols = list(data.select_dtypes(include=['category']).columns)
        cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
        if cat_cols:
            # Copy before touching any category column so that the caller's
            # DataFrame is never altered, not even by set_categories below.
            data = data.copy()
        if pandas_categorical is None:  # train dataset
            pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
        else:
            if len(cat_cols) != len(pandas_categorical):
                raise ValueError('train and valid dataset categorical_feature do not match.')
            for col, category in zip_(cat_cols, pandas_categorical):
                if list(data[col].cat.categories) != list(category):
                    data[col] = data[col].cat.set_categories(category)
        if cat_cols:
            # Replace categories by their integer codes; code -1 becomes NaN.
            data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
        if categorical_feature is not None:
            if feature_name is None:
                feature_name = list(data.columns)
            if categorical_feature == 'auto':  # use cat cols from DataFrame
                categorical_feature = cat_cols_not_ordered
            else:  # use cat cols specified by user
                categorical_feature = list(categorical_feature)
        if feature_name == 'auto':
            feature_name = list(data.columns)
        bad_indices = _get_bad_pandas_dtypes(data.dtypes)
        if bad_indices:
            raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
                             "Did not expect the data types in the following fields: "
                             + ', '.join(data.columns[bad_indices]))
        data = data.values
        if data.dtype != np.float32 and data.dtype != np.float64:
            data = data.astype(np.float32)
    else:
        if feature_name == 'auto':
            feature_name = None
        if categorical_feature == 'auto':
            categorical_feature = None
    return data, feature_name, categorical_feature, pandas_categorical
354
355
356
357
358
359


def _label_from_pandas(label):
    """Flatten a single-column DataFrame label into a float32 numpy array.

    Anything that is not a DataFrame is returned unchanged.

    Raises
    ------
    ValueError
        If the DataFrame has more than one column or an unsupported dtype.
    """
    if isinstance(label, DataFrame):
        if len(label.columns) > 1:
            raise ValueError('DataFrame for label cannot have multiple columns')
        if _get_bad_pandas_dtypes(label.dtypes):
            raise ValueError('DataFrame.dtypes for label must be int, float or bool')
        label = np.ravel(label.values.astype(np.float32, copy=False))
    return label


366
367
368
369
370
371
372
373
374
375
376
def _dump_pandas_categorical(pandas_categorical, file_name=None):
    """Serialize ``pandas_categorical`` as a trailing model line.

    Parameters
    ----------
    pandas_categorical : list of lists or None
        Category values to serialize as JSON.
    file_name : string or None, optional (default=None)
        When given, the serialized line is also appended to this file.

    Returns
    -------
    result : string
        The ``pandas_categorical:`` line, wrapped in newlines.
    """
    serialized = json.dumps(pandas_categorical, default=json_default_with_numpy)
    pandas_str = ''.join(('\npandas_categorical:', serialized, '\n'))
    if file_name is None:
        return pandas_str
    with open(file_name, 'a') as model_file:
        model_file.write(pandas_str)
    return pandas_str


def _load_pandas_categorical(file_name=None, model_str=None):
377
378
    pandas_key = 'pandas_categorical:'
    offset = -len(pandas_key)
379
    if file_name is not None:
380
381
382
383
384
385
386
387
388
389
390
391
392
        max_offset = -os.path.getsize(file_name)
        with open(file_name, 'rb') as f:
            while True:
                if offset < max_offset:
                    offset = max_offset
                f.seek(offset, os.SEEK_END)
                lines = f.readlines()
                if len(lines) >= 2:
                    break
                offset *= 2
        last_line = decode_string(lines[-1]).strip()
        if not last_line.startswith(pandas_key):
            last_line = decode_string(lines[-2]).strip()
393
    elif model_str is not None:
394
395
396
397
398
399
        idx = model_str.rfind('\n', 0, offset)
        last_line = model_str[idx:].strip()
    if last_line.startswith(pandas_key):
        return json.loads(last_line[len(pandas_key):])
    else:
        return None
400
401


Guolin Ke's avatar
Guolin Ke committed
402
class _InnerPredictor(object):
403
404
405
406
407
    """_InnerPredictor of LightGBM.

    Not exposed to user.
    Used only for prediction, usually used for continued training.

Nikita Titov's avatar
Nikita Titov committed
408
409
410
    .. note::

        Can be converted from Booster, but cannot be converted to Booster.
Guolin Ke's avatar
Guolin Ke committed
411
    """
412

413
    def __init__(self, model_file=None, booster_handle=None, pred_parameter=None):
414
        """Initialize the _InnerPredictor.
wxchan's avatar
wxchan committed
415
416
417

        Parameters
        ----------
418
        model_file : string or None, optional (default=None)
wxchan's avatar
wxchan committed
419
            Path to the model file.
420
421
422
423
        booster_handle : object or None, optional (default=None)
            Handle of Booster.
        pred_parameter: dict or None, optional (default=None)
            Other parameters for the prediciton.
wxchan's avatar
wxchan committed
424
425
426
427
428
        """
        self.handle = ctypes.c_void_p()
        self.__is_manage_handle = True
        if model_file is not None:
            """Prediction task"""
Guolin Ke's avatar
Guolin Ke committed
429
            out_num_iterations = ctypes.c_int(0)
wxchan's avatar
wxchan committed
430
431
432
433
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(out_num_iterations),
                ctypes.byref(self.handle)))
Guolin Ke's avatar
Guolin Ke committed
434
            out_num_class = ctypes.c_int(0)
wxchan's avatar
wxchan committed
435
436
437
438
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
439
            self.num_total_iteration = out_num_iterations.value
440
            self.pandas_categorical = _load_pandas_categorical(file_name=model_file)
wxchan's avatar
wxchan committed
441
        elif booster_handle is not None:
Guolin Ke's avatar
Guolin Ke committed
442
            self.__is_manage_handle = False
wxchan's avatar
wxchan committed
443
            self.handle = booster_handle
Guolin Ke's avatar
Guolin Ke committed
444
            out_num_class = ctypes.c_int(0)
wxchan's avatar
wxchan committed
445
446
447
448
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
Guolin Ke's avatar
Guolin Ke committed
449
            out_num_iterations = ctypes.c_int(0)
wxchan's avatar
wxchan committed
450
451
452
            _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
                self.handle,
                ctypes.byref(out_num_iterations)))
453
            self.num_total_iteration = out_num_iterations.value
454
            self.pandas_categorical = None
wxchan's avatar
wxchan committed
455
        else:
456
            raise TypeError('Need model_file or booster_handle to create a predictor')
wxchan's avatar
wxchan committed
457

458
459
        pred_parameter = {} if pred_parameter is None else pred_parameter
        self.pred_parameter = param_dict_to_str(pred_parameter)
cbecker's avatar
cbecker committed
460

wxchan's avatar
wxchan committed
461
    def __del__(self):
462
463
464
465
466
        try:
            if self.__is_manage_handle:
                _safe_call(_LIB.LGBM_BoosterFree(self.handle))
        except AttributeError:
            pass
wxchan's avatar
wxchan committed
467

468
469
470
471
472
    def __getstate__(self):
        this = self.__dict__.copy()
        this.pop('handle', None)
        return this

wxchan's avatar
wxchan committed
473
    def predict(self, data, num_iteration=-1,
                raw_score=False, pred_leaf=False, pred_contrib=False, data_has_header=False,
                is_reshape=True):
        """Predict logic.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
            Data source for prediction.
            When data type is string, it represents the path of txt file.
        num_iteration : int, optional (default=-1)
            Iteration used for prediction.
        raw_score : bool, optional (default=False)
            Whether to predict raw scores.
        pred_leaf : bool, optional (default=False)
            Whether to predict leaf index.
        pred_contrib : bool, optional (default=False)
            Whether to predict feature contributions.
        data_has_header : bool, optional (default=False)
            Whether data has header.
            Used only for txt data.
        is_reshape : bool, optional (default=True)
            Whether to reshape to (nrow, ncol).

        Returns
        -------
        result : numpy array
            Prediction result.

        Raises
        ------
        TypeError
            If ``data`` is a Dataset or cannot be converted to a sparse matrix.
        ValueError
            If a list cannot be converted to a numpy array, or the number of
            predictions is not a multiple of the number of rows.
        """
        if isinstance(data, Dataset):
            raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
        data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]
        # Later flags win: pred_contrib over pred_leaf over raw_score.
        predict_type = C_API_PREDICT_NORMAL
        if raw_score:
            predict_type = C_API_PREDICT_RAW_SCORE
        if pred_leaf:
            predict_type = C_API_PREDICT_LEAF_INDEX
        if pred_contrib:
            predict_type = C_API_PREDICT_CONTRIB
        int_data_has_header = 1 if data_has_header else 0
        if num_iteration > self.num_total_iteration:
            num_iteration = self.num_total_iteration

        if isinstance(data, string_type):
            # The C API writes predictions to a file, which is parsed back here.
            with _TempFile() as f:
                _safe_call(_LIB.LGBM_BoosterPredictForFile(
                    self.handle,
                    c_str(data),
                    ctypes.c_int(int_data_has_header),
                    ctypes.c_int(predict_type),
                    ctypes.c_int(num_iteration),
                    c_str(self.pred_parameter),
                    c_str(f.name)))
                lines = f.readlines()
                nrow = len(lines)
                preds = [float(token) for line in lines for token in line.split('\t')]
                # np.array(..., copy=False) raises on NumPy >= 2.0 for list input.
                preds = np.asarray(preds, dtype=np.float64)
        elif isinstance(data, scipy.sparse.csr_matrix):
            preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
        elif isinstance(data, scipy.sparse.csc_matrix):
            preds, nrow = self.__pred_for_csc(data, num_iteration, predict_type)
        elif isinstance(data, np.ndarray):
            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
        elif isinstance(data, list):
            try:
                data = np.array(data)
            except Exception:  # let KeyboardInterrupt / SystemExit propagate
                raise ValueError('Cannot convert data list to numpy array.')
            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
        elif isinstance(data, DataTable):
            preds, nrow = self.__pred_for_np2d(data.to_numpy(), num_iteration, predict_type)
        else:
            try:
                warnings.warn('Converting data to scipy sparse matrix.')
                csr = scipy.sparse.csr_matrix(data)
            except Exception:  # let KeyboardInterrupt / SystemExit propagate
                raise TypeError('Cannot predict data for type {}'.format(type(data).__name__))
            preds, nrow = self.__pred_for_csr(csr, num_iteration, predict_type)
        if pred_leaf:
            preds = preds.astype(np.int32)
        if is_reshape and preds.size != nrow:
            if preds.size % nrow == 0:
                preds = preds.reshape(nrow, -1)
            else:
                raise ValueError('Length of predict result (%d) cannot be divide nrow (%d)'
                                 % (preds.size, nrow))
        return preds

    def __get_num_preds(self, num_iteration, nrow, predict_type):
        """Get size of prediction result.

        Parameters
        ----------
        num_iteration : int
            Iteration used for prediction.
        nrow : int
            Number of rows to predict; must not exceed ``MAX_INT32``.
        predict_type : int
            One of the ``C_API_PREDICT_*`` constants.

        Returns
        -------
        n_preds : int
            Number of values reported by ``LGBM_BoosterCalcNumPredict``.

        Raises
        ------
        LightGBMError
            If ``nrow`` is greater than ``MAX_INT32``.
        """
        if nrow > MAX_INT32:
            raise LightGBMError('LightGBM cannot perform prediction for data '
                                'with number of rows greater than MAX_INT32 (%d).\n'
                                'You can split your data into chunks '
                                'and then concatenate predictions for them' % MAX_INT32)
        n_preds = ctypes.c_int64(0)
        _safe_call(_LIB.LGBM_BoosterCalcNumPredict(
            self.handle,
            ctypes.c_int(nrow),
            ctypes.c_int(predict_type),
            ctypes.c_int(num_iteration),
            ctypes.byref(n_preds)))
        return n_preds.value
wxchan's avatar
wxchan committed
576
577

    def __pred_for_np2d(self, mat, num_iteration, predict_type):
578
        """Predict for a 2-D numpy matrix."""
wxchan's avatar
wxchan committed
579
        if len(mat.shape) != 2:
580
            raise ValueError('Input numpy.ndarray or list must be 2 dimensional')
wxchan's avatar
wxchan committed
581

582
583
584
        def inner_predict(mat, num_iteration, predict_type, preds=None):
            if mat.dtype == np.float32 or mat.dtype == np.float64:
                data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
585
            else:  # change non-float data to float data, need to copy
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
                data = np.array(mat.reshape(mat.size), dtype=np.float32)
            ptr_data, type_ptr_data, _ = c_float_array(data)
            n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type)
            if preds is None:
                preds = np.zeros(n_preds, dtype=np.float64)
            elif len(preds.shape) != 1 or len(preds) != n_preds:
                raise ValueError("Wrong length of pre-allocated predict array")
            out_num_preds = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterPredictForMat(
                self.handle,
                ptr_data,
                ctypes.c_int(type_ptr_data),
                ctypes.c_int(mat.shape[0]),
                ctypes.c_int(mat.shape[1]),
                ctypes.c_int(C_API_IS_ROW_MAJOR),
                ctypes.c_int(predict_type),
                ctypes.c_int(num_iteration),
                c_str(self.pred_parameter),
                ctypes.byref(out_num_preds),
                preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
            if n_preds != out_num_preds.value:
                raise ValueError("Wrong length for predict results")
            return preds, mat.shape[0]

        nrow = mat.shape[0]
        if nrow > MAX_INT32:
            sections = np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)
            # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
            n_preds = [self.__get_num_preds(num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])]
            n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
            preds = np.zeros(sum(n_preds), dtype=np.float64)
617
618
            for chunk, (start_idx_pred, end_idx_pred) in zip_(np.array_split(mat, sections),
                                                              zip_(n_preds_sections, n_preds_sections[1:])):
619
620
621
                # avoid memory consumption by arrays concatenation operations
                inner_predict(chunk, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
            return preds, nrow
wxchan's avatar
wxchan committed
622
        else:
623
            return inner_predict(mat, num_iteration, predict_type)
wxchan's avatar
wxchan committed
624
625

    def __pred_for_csr(self, csr, num_iteration, predict_type):
        """Predict for a CSR data.

        The input matrix is left untouched: the int32 view/copy of its column
        indices that the C API needs is kept in a local variable instead of
        being assigned back to ``csr.indices``.

        Parameters
        ----------
        csr : CSR sparse matrix
            Data to predict on (``indptr``, ``indices``, ``data`` and ``shape`` are read).
        num_iteration : int
            Forwarded to the C API prediction call.
        predict_type : int
            Forwarded to the C API prediction call.

        Returns
        -------
        preds : numpy 1-D array
            Flat float64 array with the predictions.
        nrow : int
            Number of rows of ``csr``.
        """
        def inner_predict(csr, num_iteration, predict_type, preds=None):
            nrow = len(csr.indptr) - 1
            n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
            if preds is None:
                preds = np.zeros(n_preds, dtype=np.float64)
            elif len(preds.shape) != 1 or len(preds) != n_preds:
                raise ValueError("Wrong length of pre-allocated predict array")
            out_num_preds = ctypes.c_int64(0)

            ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr)
            ptr_data, type_ptr_data, _ = c_float_array(csr.data)

            assert csr.shape[1] <= MAX_INT32
            # Keep the converted indices local so the caller's matrix is not mutated.
            csr_indices = csr.indices.astype(np.int32, copy=False)

            _safe_call(_LIB.LGBM_BoosterPredictForCSR(
                self.handle,
                ptr_indptr,
                ctypes.c_int32(type_ptr_indptr),
                csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
                ptr_data,
                ctypes.c_int(type_ptr_data),
                ctypes.c_int64(len(csr.indptr)),
                ctypes.c_int64(len(csr.data)),
                ctypes.c_int64(csr.shape[1]),
                ctypes.c_int(predict_type),
                ctypes.c_int(num_iteration),
                c_str(self.pred_parameter),
                ctypes.byref(out_num_preds),
                preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
            if n_preds != out_num_preds.value:
                raise ValueError("Wrong length for predict results")
            return preds, nrow

        nrow = len(csr.indptr) - 1
        if nrow > MAX_INT32:
            sections = [0] + list(np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)) + [nrow]
            # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
            n_preds = [self.__get_num_preds(num_iteration, i, predict_type) for i in np.diff(sections)]
            n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
            preds = np.zeros(sum(n_preds), dtype=np.float64)
            for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip_(zip_(sections, sections[1:]),
                                                                             zip_(n_preds_sections, n_preds_sections[1:])):
                # avoid memory consumption by arrays concatenation operations:
                # each chunk writes straight into its slice of the shared output buffer
                inner_predict(csr[start_idx:end_idx], num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
            return preds, nrow
        else:
            return inner_predict(csr, num_iteration, predict_type)
Guolin Ke's avatar
Guolin Ke committed
675
676

    def __pred_for_csc(self, csc, num_iteration, predict_type):
        """Predict for a CSC data.

        Inputs with more than ``MAX_INT32`` rows are converted to CSR and handled
        by ``__pred_for_csr``. The input matrix is not modified: the int32 row
        indices required by the C API are kept in a local variable.

        Parameters
        ----------
        csc : CSC sparse matrix
            Data to predict on (``indptr``, ``indices``, ``data`` and ``shape`` are read).
        num_iteration : int
            Forwarded to the C API prediction call.
        predict_type : int
            Forwarded to the C API prediction call.

        Returns
        -------
        preds : numpy 1-D array
            Flat float64 array with the predictions.
        nrow : int
            Number of rows of ``csc``.
        """
        nrow = csc.shape[0]
        if nrow > MAX_INT32:
            return self.__pred_for_csr(csc.tocsr(), num_iteration, predict_type)
        n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
        preds = np.zeros(n_preds, dtype=np.float64)
        out_num_preds = ctypes.c_int64(0)

        ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr)
        ptr_data, type_ptr_data, _ = c_float_array(csc.data)

        assert csc.shape[0] <= MAX_INT32
        # Keep the converted indices local so the caller's matrix is not mutated.
        csc_indices = csc.indices.astype(np.int32, copy=False)

        _safe_call(_LIB.LGBM_BoosterPredictForCSC(
            self.handle,
            ptr_indptr,
            ctypes.c_int32(type_ptr_indptr),
            csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int64(len(csc.indptr)),
            ctypes.c_int64(len(csc.data)),
            ctypes.c_int64(csc.shape[0]),
            ctypes.c_int(predict_type),
            ctypes.c_int(num_iteration),
            c_str(self.pred_parameter),
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
        if n_preds != out_num_preds.value:
            raise ValueError("Wrong length for predict results")
        return preds, nrow

wxchan's avatar
wxchan committed
710

wxchan's avatar
wxchan committed
711
712
class Dataset(object):
    """Dataset in LightGBM."""
713

714
    def __init__(self, data, label=None, reference=None,
715
                 weight=None, group=None, init_score=None, silent=False,
716
                 feature_name='auto', categorical_feature='auto', params=None,
wxchan's avatar
wxchan committed
717
                 free_raw_data=True):
718
        """Initialize Dataset.
719

wxchan's avatar
wxchan committed
720
721
        Parameters
        ----------
722
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse or list of numpy arrays
wxchan's avatar
wxchan committed
723
            Data source of Dataset.
724
            If string, it represents the path to txt file.
725
        label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
726
727
728
            Label of the data.
        reference : Dataset or None, optional (default=None)
            If this is Dataset for validation, training data should be used as reference.
729
        weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
wxchan's avatar
wxchan committed
730
            Weight for each instance.
731
        group : list, numpy 1-D array, pandas Series or None, optional (default=None)
732
            Group/query size for Dataset.
733
        init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
734
            Init score for Dataset.
735
736
737
738
739
740
741
742
743
        silent : bool, optional (default=False)
            Whether to print messages during construction.
        feature_name : list of strings or 'auto', optional (default="auto")
            Feature names.
            If 'auto' and data is pandas DataFrame, data columns names are used.
        categorical_feature : list of strings or int, or 'auto', optional (default="auto")
            Categorical features.
            If list of int, interpreted as indices.
            If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
744
            If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
745
            All values in categorical features should be less than int32 max value (2147483647).
746
            Large values could be memory consuming. Consider using consecutive integers starting from zero.
747
            All negative values in categorical features will be treated as missing values.
748
            The output cannot be monotonically constrained with respect to a categorical feature.
Nikita Titov's avatar
Nikita Titov committed
749
        params : dict or None, optional (default=None)
750
            Other parameters for Dataset.
Nikita Titov's avatar
Nikita Titov committed
751
        free_raw_data : bool, optional (default=True)
752
            If True, raw data is freed after constructing inner Dataset.
wxchan's avatar
wxchan committed
753
        """
wxchan's avatar
wxchan committed
754
755
756
757
758
759
        self.handle = None
        self.data = data
        self.label = label
        self.reference = reference
        self.weight = weight
        self.group = group
760
        self.init_score = init_score
wxchan's avatar
wxchan committed
761
762
        self.silent = silent
        self.feature_name = feature_name
763
        self.categorical_feature = categorical_feature
764
        self.params = copy.deepcopy(params)
wxchan's avatar
wxchan committed
765
766
        self.free_raw_data = free_raw_data
        self.used_indices = None
767
        self.need_slice = True
wxchan's avatar
wxchan committed
768
        self._predictor = None
769
        self.pandas_categorical = None
770
        self.params_back_up = None
771
772
        self.feature_penalty = None
        self.monotone_constraints = None
wxchan's avatar
wxchan committed
773
774

    def __del__(self):
        """Free the native handle via ``_free_handle()`` when the object is destroyed."""
        try:
            self._free_handle()
        except AttributeError:
            # An attribute read by _free_handle() is missing (presumably the object
            # was only partially initialised); this is deliberately ignored.
            pass
779
780

    def _free_handle(self):
781
        if self.handle is not None:
782
            _safe_call(_LIB.LGBM_DatasetFree(self.handle))
783
            self.handle = None
Guolin Ke's avatar
Guolin Ke committed
784
785
786
        self.need_slice = True
        if self.used_indices is not None:
            self.data = None
Nikita Titov's avatar
Nikita Titov committed
787
        return self
wxchan's avatar
wxchan committed
788

Guolin Ke's avatar
Guolin Ke committed
789
790
791
792
    def _set_init_score_by_predictor(self, predictor, data, used_indices=None):
        """Set the init score of this Dataset from raw predictions of ``predictor``.

        Parameters
        ----------
        predictor : object
            Must provide ``predict(...)`` and ``num_class``.
        data : object
            Data passed to ``predictor.predict``. If it is a string, the "header"
            aliases are looked up in ``self.params``.
        used_indices : sequence of int or None, optional (default=None)
            Row indices of this subset. Only applied here when ``data`` is a string;
            the predictions are then made for the whole file and sub-selected.
        """
        data_has_header = False
        if isinstance(data, string_type):
            # check data has header or not
            # ``self.params`` is None when the Dataset was created without params.
            params = self.params or {}
            data_has_header = any(params.get(alias, False) for alias in _ConfigAliases.get("header"))
        init_score = predictor.predict(data,
                                       raw_score=True,
                                       data_has_header=data_has_header,
                                       is_reshape=False)
        num_data = self.num_data()
        num_class = predictor.num_class
        if used_indices is not None:
            assert not self.need_slice
            if isinstance(data, string_type):
                assert num_data == len(used_indices)
                # init_score is flat and row-major (index = row * num_class + class):
                # pick the used rows in one vectorized step.
                row_idx = np.asarray(used_indices, dtype=np.intp)
                init_score = (init_score.reshape(-1, num_class)[row_idx]
                              .reshape(-1)
                              .astype(np.float32))
        if num_class > 1:
            # need to regroup init_score from row-major (row * num_class + class)
            # to class-major (class * num_data + row)
            n_used = num_data * num_class
            new_init_score = np.zeros(init_score.size, dtype=np.float32)
            # Only the first ``n_used`` entries are regrouped; any remaining
            # entries stay zero, as with the element-wise loop this replaces.
            new_init_score[:n_used] = init_score[:n_used].reshape(num_data, num_class).transpose().ravel()
            init_score = new_init_score
        self.set_init_score(init_score)

817
    def _lazy_init(self, data, label=None, reference=None,
                   weight=None, group=None, init_score=None, predictor=None,
                   silent=False, feature_name='auto',
                   categorical_feature='auto', params=None):
        """Create the native Dataset from ``data`` and set its fields.

        Returns self immediately (with ``handle`` set to None) when ``data`` is None.

        Raises
        ------
        TypeError
            If ``reference`` is neither None nor a Dataset, if ``data`` cannot be
            turned into a Dataset, if ``predictor`` has a wrong type, or if
            ``categorical_feature`` contains an entry of wrong type / unknown name.
        ValueError
            If no label is available after construction.
        """
        if data is None:
            self.handle = None
            return self
        if reference is not None:
            # Validate the type before reading attributes from it, so a wrong
            # object produces the intended TypeError instead of an AttributeError.
            if not isinstance(reference, Dataset):
                raise TypeError('Reference dataset should be None or dataset instance')
            self.pandas_categorical = reference.pandas_categorical
            categorical_feature = reference.categorical_feature
        data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data,
                                                                                             feature_name,
                                                                                             categorical_feature,
                                                                                             self.pandas_categorical)
        label = _label_from_pandas(label)

        # process for args
        params = {} if params is None else params
        args_names = (getattr(self.__class__, '_lazy_init')
                      .__code__
                      .co_varnames[:getattr(self.__class__, '_lazy_init').__code__.co_argcount])
        for key in params:
            if key in args_names:
                warnings.warn('{0} keyword has been found in `params` and will be ignored.\n'
                              'Please use {0} argument of the Dataset constructor to pass this parameter.'
                              .format(key))
        # user can set verbose with params, it has higher priority
        if not any(verbose_alias in params for verbose_alias in _ConfigAliases.get("verbosity")) and silent:
            params["verbose"] = -1
        # get categorical features
        if categorical_feature is not None:
            categorical_indices = set()
            feature_dict = {}
            if feature_name is not None:
                feature_dict = {name: i for i, name in enumerate(feature_name)}
            for name in categorical_feature:
                if isinstance(name, string_type) and name in feature_dict:
                    categorical_indices.add(feature_dict[name])
                elif isinstance(name, integer_types):
                    categorical_indices.add(name)
                else:
                    raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
                                    .format(type(name).__name__, name))
            if categorical_indices:
                for cat_alias in _ConfigAliases.get("categorical_feature"):
                    if cat_alias in params:
                        warnings.warn('{} in param dict is overridden.'.format(cat_alias))
                        params.pop(cat_alias, None)
                params['categorical_column'] = sorted(categorical_indices)

        params_str = param_dict_to_str(params)
        # process for reference dataset (its type was validated above)
        ref_dataset = None
        if reference is not None:
            ref_dataset = reference.construct().handle
        # start construct data
        if isinstance(data, string_type):
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_DatasetCreateFromFile(
                c_str(data),
                c_str(params_str),
                ref_dataset,
                ctypes.byref(self.handle)))
        elif isinstance(data, scipy.sparse.csr_matrix):
            self.__init_from_csr(data, params_str, ref_dataset)
        elif isinstance(data, scipy.sparse.csc_matrix):
            self.__init_from_csc(data, params_str, ref_dataset)
        elif isinstance(data, np.ndarray):
            self.__init_from_np2d(data, params_str, ref_dataset)
        elif isinstance(data, list) and len(data) > 0 and all(isinstance(x, np.ndarray) for x in data):
            self.__init_from_list_np2d(data, params_str, ref_dataset)
        elif isinstance(data, DataTable):
            self.__init_from_np2d(data.to_numpy(), params_str, ref_dataset)
        else:
            # Last resort: try to interpret the input as something convertible to CSR.
            try:
                csr = scipy.sparse.csr_matrix(data)
                self.__init_from_csr(csr, params_str, ref_dataset)
            except Exception:
                # ``Exception`` rather than ``BaseException`` so that
                # KeyboardInterrupt / SystemExit are not turned into a TypeError.
                raise TypeError('Cannot initialize Dataset from {}'.format(type(data).__name__))
        if label is not None:
            self.set_label(label)
        if self.get_label() is None:
            raise ValueError("Label should not be None")
        if weight is not None:
            self.set_weight(weight)
        if group is not None:
            self.set_group(group)
        if isinstance(predictor, _InnerPredictor):
            if self._predictor is None and init_score is not None:
                warnings.warn("The init_score will be overridden by the prediction of init_model.")
            self._set_init_score_by_predictor(predictor, data)
        elif init_score is not None:
            self.set_init_score(init_score)
        elif predictor is not None:
            raise TypeError('Wrong predictor type {}'.format(type(predictor).__name__))
        # set feature names
        return self.set_feature_name(feature_name)
wxchan's avatar
wxchan committed
916
917

    def __init_from_np2d(self, mat, params_str, ref_dataset):
        """Initialize data from a 2-D numpy matrix.

        The matrix is flattened before being handed to the C API; float32 and
        float64 input keeps its dtype, any other dtype is converted to float32.

        Raises
        ------
        ValueError
            If ``mat`` is not 2-dimensional.
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')

        self.handle = ctypes.c_void_p()
        flat = mat.reshape(mat.size)
        is_float = mat.dtype == np.float32 or mat.dtype == np.float64
        if is_float:
            data = np.array(flat, dtype=mat.dtype, copy=False)
        else:
            # change non-float data to float data, need to copy
            data = np.array(flat, dtype=np.float32)

        ptr_data, type_ptr_data, _ = c_float_array(data)
        num_row, num_col = mat.shape[0], mat.shape[1]
        _safe_call(_LIB.LGBM_DatasetCreateFromMat(
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int(num_row),
            ctypes.c_int(num_col),
            ctypes.c_int(C_API_IS_ROW_MAJOR),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self
wxchan's avatar
wxchan committed
939

940
    def __init_from_list_np2d(self, mats, params_str, ref_dataset):
941
        """Initialize data from a list of 2-D numpy matrices."""
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
        ncol = mats[0].shape[1]
        nrow = np.zeros((len(mats),), np.int32)
        if mats[0].dtype == np.float64:
            ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))()
        else:
            ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))()

        holders = []
        type_ptr_data = None

        for i, mat in enumerate(mats):
            if len(mat.shape) != 2:
                raise ValueError('Input numpy.ndarray must be 2 dimensional')

            if mat.shape[1] != ncol:
                raise ValueError('Input arrays must have same number of columns')

            nrow[i] = mat.shape[0]

            if mat.dtype == np.float32 or mat.dtype == np.float64:
                mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
963
            else:  # change non-float data to float data, need to copy
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
                mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)

            chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i])
            if type_ptr_data is not None and chunk_type_ptr_data != type_ptr_data:
                raise ValueError('Input chunks must have same type')
            ptr_data[i] = chunk_ptr_data
            type_ptr_data = chunk_type_ptr_data
            holders.append(holder)

        self.handle = ctypes.c_void_p()
        _safe_call(_LIB.LGBM_DatasetCreateFromMats(
            ctypes.c_int(len(mats)),
            ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))),
            ctypes.c_int(type_ptr_data),
            nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ctypes.c_int(ncol),
            ctypes.c_int(C_API_IS_ROW_MAJOR),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
Nikita Titov's avatar
Nikita Titov committed
984
        return self
985

wxchan's avatar
wxchan committed
986
    def __init_from_csr(self, csr, params_str, ref_dataset):
        """Initialize data from a CSR matrix.

        The input matrix is not modified: the int32 column indices required by
        the C API are kept in a local variable rather than assigned back to
        ``csr.indices``.

        Raises
        ------
        ValueError
            If ``csr.indices`` and ``csr.data`` differ in length.
        """
        if len(csr.indices) != len(csr.data):
            raise ValueError('Length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
        self.handle = ctypes.c_void_p()

        ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr)
        ptr_data, type_ptr_data, _ = c_float_array(csr.data)

        assert csr.shape[1] <= MAX_INT32
        # Keep the converted indices local so the caller's matrix is not mutated.
        csr_indices = csr.indices.astype(np.int32, copy=False)

        _safe_call(_LIB.LGBM_DatasetCreateFromCSR(
            ptr_indptr,
            ctypes.c_int(type_ptr_indptr),
            csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int64(len(csr.indptr)),
            ctypes.c_int64(len(csr.data)),
            ctypes.c_int64(csr.shape[1]),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self
wxchan's avatar
wxchan committed
1011

Guolin Ke's avatar
Guolin Ke committed
1012
    def __init_from_csc(self, csc, params_str, ref_dataset):
        """Initialize data from a CSC matrix.

        The input matrix is not modified: the int32 row indices required by
        the C API are kept in a local variable rather than assigned back to
        ``csc.indices``.

        Raises
        ------
        ValueError
            If ``csc.indices`` and ``csc.data`` differ in length.
        """
        if len(csc.indices) != len(csc.data):
            raise ValueError('Length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data)))
        self.handle = ctypes.c_void_p()

        ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr)
        ptr_data, type_ptr_data, _ = c_float_array(csc.data)

        assert csc.shape[0] <= MAX_INT32
        # Keep the converted indices local so the caller's matrix is not mutated.
        csc_indices = csc.indices.astype(np.int32, copy=False)

        _safe_call(_LIB.LGBM_DatasetCreateFromCSC(
            ptr_indptr,
            ctypes.c_int(type_ptr_indptr),
            csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int64(len(csc.indptr)),
            ctypes.c_int64(len(csc.data)),
            ctypes.c_int64(csc.shape[0]),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self
Guolin Ke's avatar
Guolin Ke committed
1037

wxchan's avatar
wxchan committed
1038
    def construct(self):
1039
1040
1041
1042
1043
        """Lazy init.

        Returns
        -------
        self : Dataset
Nikita Titov's avatar
Nikita Titov committed
1044
            Constructed Dataset object.
1045
        """
1046
        if self.handle is None:
wxchan's avatar
wxchan committed
1047
1048
            if self.reference is not None:
                if self.used_indices is None:
1049
                    # create valid
1050
                    self._lazy_init(self.data, label=self.label, reference=self.reference,
1051
1052
                                    weight=self.weight, group=self.group,
                                    init_score=self.init_score, predictor=self._predictor,
1053
                                    silent=self.silent, feature_name=self.feature_name, params=self.params)
wxchan's avatar
wxchan committed
1054
                else:
1055
                    # construct subset
wxchan's avatar
wxchan committed
1056
                    used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
1057
                    assert used_indices.flags.c_contiguous
Guolin Ke's avatar
Guolin Ke committed
1058
                    if self.reference.group is not None:
1059
                        group_info = np.array(self.reference.group).astype(np.int32, copy=False)
1060
1061
                        _, self.group = np.unique(np.repeat(range_(len(group_info)), repeats=group_info)[self.used_indices],
                                                  return_counts=True)
1062
                    self.handle = ctypes.c_void_p()
wxchan's avatar
wxchan committed
1063
1064
                    params_str = param_dict_to_str(self.params)
                    _safe_call(_LIB.LGBM_DatasetGetSubset(
1065
                        self.reference.construct().handle,
wxchan's avatar
wxchan committed
1066
                        used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
Guolin Ke's avatar
Guolin Ke committed
1067
                        ctypes.c_int(used_indices.shape[0]),
wxchan's avatar
wxchan committed
1068
1069
                        c_str(params_str),
                        ctypes.byref(self.handle)))
Guolin Ke's avatar
Guolin Ke committed
1070
1071
                    if not self.free_raw_data:
                        self.get_data()
Guolin Ke's avatar
Guolin Ke committed
1072
1073
                    if self.group is not None:
                        self.set_group(self.group)
wxchan's avatar
wxchan committed
1074
1075
                    if self.get_label() is None:
                        raise ValueError("Label should not be None.")
Guolin Ke's avatar
Guolin Ke committed
1076
1077
1078
                    if isinstance(self._predictor, _InnerPredictor) and self._predictor is not self.reference._predictor:
                        self.get_data()
                        self._set_init_score_by_predictor(self._predictor, self.data, used_indices)
wxchan's avatar
wxchan committed
1079
            else:
1080
                # create train
1081
                self._lazy_init(self.data, label=self.label,
1082
1083
1084
                                weight=self.weight, group=self.group,
                                init_score=self.init_score, predictor=self._predictor,
                                silent=self.silent, feature_name=self.feature_name,
1085
                                categorical_feature=self.categorical_feature, params=self.params)
wxchan's avatar
wxchan committed
1086
1087
1088
            if self.free_raw_data:
                self.data = None
        return self
wxchan's avatar
wxchan committed
1089

wxchan's avatar
wxchan committed
1090
    def create_valid(self, data, label=None, weight=None, group=None,
1091
                     init_score=None, silent=False, params=None):
1092
        """Create validation data align with current Dataset.
wxchan's avatar
wxchan committed
1093
1094
1095

        Parameters
        ----------
1096
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse or list of numpy arrays
wxchan's avatar
wxchan committed
1097
            Data source of Dataset.
1098
            If string, it represents the path to txt file.
1099
        label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
1100
1101
            Label of the data.
        weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
wxchan's avatar
wxchan committed
1102
            Weight for each instance.
1103
        group : list, numpy 1-D array, pandas Series or None, optional (default=None)
1104
            Group/query size for Dataset.
1105
        init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
1106
            Init score for Dataset.
1107
1108
        silent : bool, optional (default=False)
            Whether to print messages during construction.
Nikita Titov's avatar
Nikita Titov committed
1109
        params : dict or None, optional (default=None)
1110
            Other parameters for validation Dataset.
1111
1112
1113

        Returns
        -------
Nikita Titov's avatar
Nikita Titov committed
1114
1115
        valid : Dataset
            Validation Dataset with reference to self.
wxchan's avatar
wxchan committed
1116
        """
1117
        ret = Dataset(data, label=label, reference=self,
1118
1119
                      weight=weight, group=group, init_score=init_score,
                      silent=silent, params=params, free_raw_data=self.free_raw_data)
wxchan's avatar
wxchan committed
1120
        ret._predictor = self._predictor
1121
        ret.pandas_categorical = self.pandas_categorical
wxchan's avatar
wxchan committed
1122
        return ret
wxchan's avatar
wxchan committed
1123

wxchan's avatar
wxchan committed
1124
    def subset(self, used_indices, params=None):
1125
        """Get subset of current Dataset.
wxchan's avatar
wxchan committed
1126
1127
1128
1129

        Parameters
        ----------
        used_indices : list of int
1130
            Indices used to create the subset.
Nikita Titov's avatar
Nikita Titov committed
1131
        params : dict or None, optional (default=None)
1132
            These parameters will be passed to Dataset constructor.
1133
1134
1135
1136
1137

        Returns
        -------
        subset : Dataset
            Subset of the current Dataset.
wxchan's avatar
wxchan committed
1138
        """
wxchan's avatar
wxchan committed
1139
1140
        if params is None:
            params = self.params
wxchan's avatar
wxchan committed
1141
        ret = Dataset(None, reference=self, feature_name=self.feature_name,
1142
1143
                      categorical_feature=self.categorical_feature, params=params,
                      free_raw_data=self.free_raw_data)
wxchan's avatar
wxchan committed
1144
        ret._predictor = self._predictor
1145
        ret.pandas_categorical = self.pandas_categorical
1146
        ret.used_indices = sorted(used_indices)
wxchan's avatar
wxchan committed
1147
1148
1149
        return ret

    def save_binary(self, filename):
        """Save Dataset to a binary file.

        The Dataset is constructed first if it has not been constructed yet.

        Parameters
        ----------
        filename : string
            Name of the output file.

        Returns
        -------
        self : Dataset
            Returns self.
        """
        dataset_handle = self.construct().handle
        _safe_call(_LIB.LGBM_DatasetSaveBinary(dataset_handle, c_str(filename)))
        return self
wxchan's avatar
wxchan committed
1166
1167

    def _update_params(self, params):
1168
1169
        if self.handle is not None and params is not None:
            _safe_call(_LIB.LGBM_DatasetUpdateParam(self.handle, c_str(param_dict_to_str(params))))
wxchan's avatar
wxchan committed
1170
        if not self.params:
1171
            self.params = copy.deepcopy(params)
wxchan's avatar
wxchan committed
1172
        else:
1173
            self.params_back_up = copy.deepcopy(self.params)
wxchan's avatar
wxchan committed
1174
            self.params.update(params)
Nikita Titov's avatar
Nikita Titov committed
1175
        return self
wxchan's avatar
wxchan committed
1176

1177
1178
1179
    def _reverse_update_params(self):
        self.params = copy.deepcopy(self.params_back_up)
        self.params_back_up = None
1180
1181
        if self.handle is not None and self.params is not None:
            _safe_call(_LIB.LGBM_DatasetUpdateParam(self.handle, c_str(param_dict_to_str(self.params))))
Nikita Titov's avatar
Nikita Titov committed
1182
        return self
1183

wxchan's avatar
wxchan committed
1184
    def set_field(self, field_name, data):
        """Set property into the Dataset.

        Parameters
        ----------
        field_name : string
            The field name of the information.
        data : list, numpy 1-D array, pandas Series or None
            The array of data to be set.

        Returns
        -------
        self : Dataset
            Dataset with set property.
        """
        if self.handle is None:
            raise Exception("Cannot set %s before construct dataset" % field_name)
        if data is None:
            # set to None: pass a null pointer with zero length
            _safe_call(_LIB.LGBM_DatasetSetField(
                self.handle,
                c_str(field_name),
                None,
                ctypes.c_int(0),
                ctypes.c_int(FIELD_TYPE_MAPPER[field_name])))
            return self
        # 'group' is converted to int32, 'init_score' to float64, the rest to float32
        if field_name == 'group':
            target_dtype = np.int32
        elif field_name == 'init_score':
            target_dtype = np.float64
        else:
            target_dtype = np.float32
        values = list_to_1d_numpy(data, target_dtype, name=field_name)
        if values.dtype in (np.float32, np.float64):
            ptr_data, type_data, _ = c_float_array(values)
        elif values.dtype == np.int32:
            ptr_data, type_data, _ = c_int_array(values)
        else:
            raise TypeError("Expected np.float32/64 or np.int32, met type({})".format(values.dtype))
        # the C type of the converted array must be the one registered for the field
        if type_data != FIELD_TYPE_MAPPER[field_name]:
            raise TypeError("Input type error for set_field")
        _safe_call(_LIB.LGBM_DatasetSetField(
            self.handle,
            c_str(field_name),
            ptr_data,
            ctypes.c_int(len(values)),
            ctypes.c_int(type_data)))
        return self
wxchan's avatar
wxchan committed
1231

wxchan's avatar
wxchan committed
1232
1233
    def get_field(self, field_name):
        """Get property from the Dataset.

        Parameters
        ----------
        field_name : string
            The field name of the information.

        Returns
        -------
        info : numpy array
            A numpy array with information from the Dataset.
            None is returned when the field has zero length.
        """
        if self.handle is None:
            raise Exception("Cannot get %s before construct Dataset" % field_name)
        # output slots filled in by the C API call
        length = ctypes.c_int()
        field_type = ctypes.c_int()
        buf = ctypes.POINTER(ctypes.c_void_p)()
        _safe_call(_LIB.LGBM_DatasetGetField(
            self.handle,
            c_str(field_name),
            ctypes.byref(length),
            ctypes.byref(buf),
            ctypes.byref(field_type)))
        type_code = field_type.value
        if type_code != FIELD_TYPE_MAPPER[field_name]:
            raise TypeError("Return type error for get_field")
        n_items = length.value
        if n_items == 0:
            return None
        # reinterpret the raw pointer according to the reported element type
        if type_code == C_API_DTYPE_INT32:
            return cint32_array_to_numpy(ctypes.cast(buf, ctypes.POINTER(ctypes.c_int32)), n_items)
        if type_code == C_API_DTYPE_FLOAT32:
            return cfloat32_array_to_numpy(ctypes.cast(buf, ctypes.POINTER(ctypes.c_float)), n_items)
        if type_code == C_API_DTYPE_FLOAT64:
            return cfloat64_array_to_numpy(ctypes.cast(buf, ctypes.POINTER(ctypes.c_double)), n_items)
        if type_code == C_API_DTYPE_INT8:
            return cint8_array_to_numpy(ctypes.cast(buf, ctypes.POINTER(ctypes.c_int8)), n_items)
        raise TypeError("Unknown type")
Guolin Ke's avatar
Guolin Ke committed
1270

1271
    def set_categorical_feature(self, categorical_feature):
        """Set categorical features.

        Parameters
        ----------
        categorical_feature : list of int or strings
            Names or indices of categorical features.

        Returns
        -------
        self : Dataset
            Dataset with set categorical features.
        """
        # nothing to do when the requested value is already in place
        if self.categorical_feature == categorical_feature:
            return self
        if self.data is None:
            raise LightGBMError("Cannot set categorical feature after freed raw data, "
                                "set free_raw_data=False when construct Dataset to avoid this.")
        if self.categorical_feature is None:
            self.categorical_feature = categorical_feature
            return self._free_handle()
        if categorical_feature == 'auto':
            # the value already stored in the Dataset is kept
            warnings.warn('Using categorical_feature in Dataset.')
            return self
        warnings.warn('categorical_feature in Dataset is overridden.\n'
                      'New categorical_feature is {}'.format(sorted(categorical_feature)))
        self.categorical_feature = categorical_feature
        return self._free_handle()
1301

Guolin Ke's avatar
Guolin Ke committed
1302
    def _set_predictor(self, predictor):
1303
1304
1305
1306
        """Set predictor for continued training.

        It is not recommended for user to call this function.
        Please use init_model argument in engine.train() or engine.cv() instead.
Guolin Ke's avatar
Guolin Ke committed
1307
1308
        """
        if predictor is self._predictor:
Nikita Titov's avatar
Nikita Titov committed
1309
            return self
1310
1311
1312
        if self.data is not None or (self.used_indices is not None
                                     and self.reference is not None
                                     and self.reference.data is not None):
Guolin Ke's avatar
Guolin Ke committed
1313
            self._predictor = predictor
Nikita Titov's avatar
Nikita Titov committed
1314
            return self._free_handle()
Guolin Ke's avatar
Guolin Ke committed
1315
        else:
1316
1317
            raise LightGBMError("Cannot set predictor after freed raw data, "
                                "set free_raw_data=False when construct Dataset to avoid this.")
Guolin Ke's avatar
Guolin Ke committed
1318
1319

    def set_reference(self, reference):
        """Set reference Dataset.

        Parameters
        ----------
        reference : Dataset
            Reference that is used as a template to construct the current Dataset.

        Returns
        -------
        self : Dataset
            Dataset with set reference.
        """
        # take over categorical features, feature names and predictor of the reference
        configured = self.set_categorical_feature(reference.categorical_feature)
        configured = configured.set_feature_name(reference.feature_name)
        configured._set_predictor(reference._predictor)
        # we're done if self and reference share a common upstream reference
        shared = self.get_ref_chain().intersection(reference.get_ref_chain())
        if shared:
            return self
        if self.data is None:
            raise LightGBMError("Cannot set reference after freed raw data, "
                                "set free_raw_data=False when construct Dataset to avoid this.")
        self.reference = reference
        return self._free_handle()
Guolin Ke's avatar
Guolin Ke committed
1344
1345

    def set_feature_name(self, feature_name):
        """Set feature name.

        Parameters
        ----------
        feature_name : list of strings
            Feature names.

        Returns
        -------
        self : Dataset
            Dataset with set feature name.
        """
        explicit = feature_name != 'auto'
        if explicit:
            self.feature_name = feature_name
        # names are only passed to the C API for a constructed Dataset and real names
        if self.handle is None or feature_name is None or not explicit:
            return self
        n_names = len(feature_name)
        if n_names != self.num_feature():
            raise ValueError("Length of feature_name({}) and num_feature({}) don't match"
                             .format(n_names, self.num_feature()))
        encoded_names = [c_str(name) for name in feature_name]
        _safe_call(_LIB.LGBM_DatasetSetFeatureNames(
            self.handle,
            c_array(ctypes.c_char_p, encoded_names),
            ctypes.c_int(n_names)))
        return self
Guolin Ke's avatar
Guolin Ke committed
1370
1371

    def set_label(self, label):
        """Set label of Dataset.

        Parameters
        ----------
        label : list, numpy 1-D array, pandas Series / one-column DataFrame or None
            The label information to be set into Dataset.

        Returns
        -------
        self : Dataset
            Dataset with set label.
        """
        self.label = label
        if self.handle is None:
            # not constructed yet: only the Python attribute is stored
            return self
        converted = list_to_1d_numpy(_label_from_pandas(label), name='label')
        self.set_field('label', converted)
        # original values can be modified at cpp side, so read them back
        self.label = self.get_field('label')
        return self
Guolin Ke's avatar
Guolin Ke committed
1390
1391

    def set_weight(self, weight):
        """Set weight of each instance.

        Parameters
        ----------
        weight : list, numpy 1-D array, pandas Series or None
            Weight to be set for each data point.

        Returns
        -------
        self : Dataset
            Dataset with set weight.
        """
        # a weight for which ``weight == 1`` holds everywhere is stored as None
        effective = None if (weight is None or np.all(weight == 1)) else weight
        self.weight = effective
        if self.handle is None or effective is None:
            return self
        self.set_field('weight', list_to_1d_numpy(effective, name='weight'))
        # original values can be modified at cpp side, so read them back
        self.weight = self.get_field('weight')
        return self
Guolin Ke's avatar
Guolin Ke committed
1412
1413

    def set_init_score(self, init_score):
        """Set init score of Booster to start from.

        Parameters
        ----------
        init_score : list, numpy 1-D array, pandas Series or None
            Init score for Booster.

        Returns
        -------
        self : Dataset
            Dataset with set init score.
        """
        self.init_score = init_score
        if self.handle is None or init_score is None:
            return self
        scores = list_to_1d_numpy(init_score, np.float64, name='init_score')
        self.set_field('init_score', scores)
        # original values can be modified at cpp side, so read them back
        self.init_score = self.get_field('init_score')
        return self
Guolin Ke's avatar
Guolin Ke committed
1432
1433

    def set_group(self, group):
        """Set group size of Dataset (used for ranking).

        Parameters
        ----------
        group : list, numpy 1-D array, pandas Series or None
            Group size of each group.

        Returns
        -------
        self : Dataset
            Dataset with set group.
        """
        self.group = group
        if self.handle is None or group is None:
            return self
        self.set_field('group', list_to_1d_numpy(group, np.int32, name='group'))
        return self
Guolin Ke's avatar
Guolin Ke committed
1451
1452

    def get_label(self):
        """Get the label of the Dataset.

        The stored value is returned when present; otherwise it is fetched
        through ``get_field`` and stored for later calls.

        Returns
        -------
        label : numpy array or None
            The label information from the Dataset.
        """
        label = self.label
        if label is None:
            label = self.label = self.get_field('label')
        return label

    def get_weight(self):
        """Get the weight of the Dataset.

        The stored value is returned when present; otherwise it is fetched
        through ``get_field`` and stored for later calls.

        Returns
        -------
        weight : numpy array or None
            Weight for each data point from the Dataset.
        """
        weight = self.weight
        if weight is None:
            weight = self.weight = self.get_field('weight')
        return weight

1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
    def get_feature_penalty(self):
        """Get the feature penalty of the Dataset.

        The stored value is returned when present; otherwise it is fetched
        through ``get_field`` and stored for later calls.

        Returns
        -------
        feature_penalty : numpy array or None
            Feature penalty for each feature in the Dataset.
        """
        penalty = self.feature_penalty
        if penalty is None:
            penalty = self.feature_penalty = self.get_field('feature_penalty')
        return penalty

    def get_monotone_constraints(self):
        """Get the monotone constraints of the Dataset.

        The stored value is returned when present; otherwise it is fetched
        through ``get_field`` and stored for later calls.

        Returns
        -------
        monotone_constraints : numpy array or None
            Monotone constraints: -1, 0 or 1, for each feature in the Dataset.
        """
        constraints = self.monotone_constraints
        if constraints is None:
            constraints = self.monotone_constraints = self.get_field('monotone_constraints')
        return constraints

Guolin Ke's avatar
Guolin Ke committed
1500
    def get_init_score(self):
        """Get the initial score of the Dataset.

        The stored value is returned when present; otherwise it is fetched
        through ``get_field`` and stored for later calls.

        Returns
        -------
        init_score : numpy array or None
            Init score of Booster.
        """
        score = self.init_score
        if score is None:
            score = self.init_score = self.get_field('init_score')
        return score

1512
1513
1514
1515
1516
    def get_data(self):
        """Get the raw data of the Dataset.

        When the Dataset still needs slicing (``need_slice`` is set and both
        ``used_indices`` and ``reference`` are available), the raw data of the
        reference is taken and, for supported types, restricted to ``used_indices``.

        Returns
        -------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of numpy arrays or None
            Raw data used in the Dataset construction.
        """
        if self.handle is None:
            raise Exception("Cannot get data before construct Dataset")
        must_slice = self.need_slice and self.used_indices is not None and self.reference is not None
        if must_slice:
            parent_data = self.reference.data
            self.data = parent_data
            if parent_data is not None:
                rows = self.used_indices
                if isinstance(parent_data, np.ndarray) or scipy.sparse.issparse(parent_data):
                    self.data = parent_data[rows, :]
                elif isinstance(parent_data, DataFrame):
                    self.data = parent_data.iloc[rows].copy()
                elif isinstance(parent_data, DataTable):
                    self.data = parent_data[rows, :]
                else:
                    # unsupported container: the data of the reference is kept as is
                    warnings.warn("Cannot subset {} type of raw data.\n"
                                  "Returning original raw data".format(type(parent_data).__name__))
            # slicing is attempted only once
            self.need_slice = False
        if self.data is None:
            raise LightGBMError("Cannot call `get_data` after freed raw data, "
                                "set free_raw_data=False when construct Dataset to avoid this.")
        return self.data

Guolin Ke's avatar
Guolin Ke committed
1540
    def get_group(self):
        """Get the group of the Dataset.

        Returns
        -------
        group : numpy array or None
            Group size of each group.
        """
        if self.group is None:
            boundaries = self.get_field('group')
            self.group = boundaries
            if boundaries is not None:
                # group data from LightGBM is boundaries data, need to convert to group size
                self.group = np.diff(boundaries)
        return self.group

    def num_data(self):
        """Get the number of rows in the Dataset.

        Returns
        -------
        number_of_rows : int
            The number of rows in the Dataset.
        """
        if self.handle is None:
            raise LightGBMError("Cannot get num_data before construct dataset")
        # output slot filled in by the C API call
        n_rows = ctypes.c_int()
        _safe_call(_LIB.LGBM_DatasetGetNumData(self.handle,
                                               ctypes.byref(n_rows)))
        return n_rows.value
Guolin Ke's avatar
Guolin Ke committed
1570
1571

    def num_feature(self):
        """Get the number of columns (features) in the Dataset.

        Returns
        -------
        number_of_columns : int
            The number of columns (features) in the Dataset.
        """
        if self.handle is None:
            raise LightGBMError("Cannot get num_feature before construct dataset")
        # output slot filled in by the C API call
        n_cols = ctypes.c_int()
        _safe_call(_LIB.LGBM_DatasetGetNumFeature(self.handle,
                                                  ctypes.byref(n_cols)))
        return n_cols.value
Guolin Ke's avatar
Guolin Ke committed
1586

1587
    def get_ref_chain(self, ref_limit=100):
        """Get a chain of Dataset objects.

        Starts with this Dataset, then follows ``reference`` links
        (reference, reference.reference, etc.) until a Dataset without reference,
        a non-Dataset object, a reference loop or ``ref_limit`` is hit.

        Parameters
        ----------
        ref_limit : int, optional (default=100)
            The limit number of references.

        Returns
        -------
        ref_chain : set of Dataset
            Chain of references of the Datasets.
        """
        chain = set()
        node = self
        while len(chain) < ref_limit and isinstance(node, Dataset):
            chain.add(node)
            upstream = node.reference
            # stop at the end of the chain or when the chain loops back on itself
            if upstream is None or upstream in chain:
                break
            node = upstream
        return chain
1616

1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
    def add_features_from(self, other):
        """Add features from other Dataset to the current Dataset.

        Both Datasets must be constructed before calling this method.

        Parameters
        ----------
        other : Dataset
            The Dataset to take features from.

        Returns
        -------
        self : Dataset
            Dataset with the new features added.
        """
        both_constructed = self.handle is not None and other.handle is not None
        if not both_constructed:
            raise ValueError('Both source and target Datasets must be constructed before adding features')
        _safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self.handle, other.handle))
        return self

1637
    def _dump_text(self, filename):
        """Write the Dataset to a text file.

        This format cannot be loaded back in by LightGBM, but is useful for debugging purposes.

        Parameters
        ----------
        filename : string
            Name of the output file.

        Returns
        -------
        self : Dataset
            Returns self.
        """
        # construct() is invoked to obtain the handle passed to the C API
        handle = self.construct().handle
        _safe_call(_LIB.LGBM_DatasetDumpText(handle, c_str(filename)))
        return self

wxchan's avatar
wxchan committed
1657

wxchan's avatar
wxchan committed
1658
class Booster(object):
1659
    """Booster in LightGBM."""
1660

1661
    def __init__(self, params=None, train_set=None, model_file=None, model_str=None, silent=False):
        """Initialize the Booster.

        The source of the Booster is chosen in this order of precedence:
        ``train_set`` (training task), ``model_file``, ``model_str``.

        Parameters
        ----------
        params : dict or None, optional (default=None)
            Parameters for Booster.
        train_set : Dataset or None, optional (default=None)
            Training dataset.
        model_file : string or None, optional (default=None)
            Path to the model file.
        model_str : string or None, optional (default=None)
            Model will be loaded from this string.
        silent : bool, optional (default=False)
            Whether to print messages during construction.

        Raises
        ------
        TypeError
            If ``train_set`` is not a Dataset, or if none of ``train_set``,
            ``model_file`` and ``model_str`` is given.
        ValueError
            If the machines entry in ``params`` is neither a string, a list nor a set.
        """
        self.handle = None
        self.network = False
        self.__need_reload_eval_info = True
        self._train_data_name = "training"
        self.__attr = {}
        self.__set_objective_to_none = False
        self.best_iteration = -1
        self.best_score = {}
        # work on a deep copy so the caller's dict is never modified
        params = copy.deepcopy(params) if params is not None else {}
        # user can set verbose with params, it has higher priority than `silent`
        verbosity_given = any(alias in params for alias in _ConfigAliases.get("verbosity"))
        if silent and not verbosity_given:
            params["verbose"] = -1
        if train_set is not None:
            # Training task
            if not isinstance(train_set, Dataset):
                raise TypeError('Training data should be Dataset instance, met {}'
                                .format(type(train_set).__name__))
            params_str = param_dict_to_str(params)
            # set network if necessary; only the first alias present in params is used
            for alias in _ConfigAliases.get("machines"):
                if alias not in params:
                    continue
                machines = params[alias]
                if isinstance(machines, string_type):
                    num_machines = len(machines.split(','))
                elif isinstance(machines, (list, set)):
                    num_machines = len(machines)
                    machines = ','.join(machines)
                else:
                    raise ValueError("Invalid machines in params.")
                self.set_network(machines,
                                 local_listen_port=params.get("local_listen_port", 12400),
                                 listen_time_out=params.get("listen_time_out", 120),
                                 num_machines=params.get("num_machines", num_machines))
                break
            # construct booster object
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_BoosterCreate(
                train_set.construct().handle,
                c_str(params_str),
                ctypes.byref(self.handle)))
            # save reference to data
            self.train_set = train_set
            self.valid_sets = []
            self.name_valid_sets = []
            self.__num_dataset = 1
            # merge the Dataset's predictor (if any) into the new booster
            self.__init_predictor = train_set._predictor
            if self.__init_predictor is not None:
                _safe_call(_LIB.LGBM_BoosterMerge(
                    self.handle,
                    self.__init_predictor.handle))
            num_class = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(num_class)))
            self.__num_class = num_class.value
            # buffer for inner predict: one slot for the training data
            self.__inner_predict_buffer = [None]
            self.__is_predicted_cur_iter = [False]
            self.__get_eval_info()
            self.pandas_categorical = train_set.pandas_categorical
        elif model_file is not None:
            # Prediction task
            num_iterations = ctypes.c_int(0)
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(num_iterations),
                ctypes.byref(self.handle)))
            num_class = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(num_class)))
            self.__num_class = num_class.value
            self.pandas_categorical = _load_pandas_categorical(file_name=model_file)
        elif model_str is not None:
            # messages are printed while loading unless `silent` is set
            self.model_from_string(model_str, not silent)
        else:
            raise TypeError('Need at least one training dataset or model file or model string '
                            'to create Booster instance')
        self.params = params
wxchan's avatar
wxchan committed
1757
1758

    def __del__(self):
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
        try:
            if self.network:
                self.free_network()
        except AttributeError:
            pass
        try:
            if self.handle is not None:
                _safe_call(_LIB.LGBM_BoosterFree(self.handle))
        except AttributeError:
            pass
wxchan's avatar
wxchan committed
1769

wxchan's avatar
wxchan committed
1770
1771
1772
1773
    def __copy__(self):
        return self.__deepcopy__(None)

    def __deepcopy__(self, _):
        """Return a new Booster rebuilt from this Booster's model string.

        The model string is produced with ``num_iteration=-1``; the memo
        argument is ignored.
        """
        return Booster(model_str=self.model_to_string(num_iteration=-1))
wxchan's avatar
wxchan committed
1777
1778
1779
1780
1781
1782
1783

    def __getstate__(self):
        this = self.__dict__.copy()
        handle = this['handle']
        this.pop('train_set', None)
        this.pop('valid_sets', None)
        if handle is not None:
1784
            this["handle"] = self.model_to_string(num_iteration=-1)
wxchan's avatar
wxchan committed
1785
1786
1787
        return this

    def __setstate__(self, state):
1788
1789
        model_str = state.get('handle', None)
        if model_str is not None:
wxchan's avatar
wxchan committed
1790
            handle = ctypes.c_void_p()
Guolin Ke's avatar
Guolin Ke committed
1791
            out_num_iterations = ctypes.c_int(0)
1792
1793
1794
1795
            _safe_call(_LIB.LGBM_BoosterLoadModelFromString(
                c_str(model_str),
                ctypes.byref(out_num_iterations),
                ctypes.byref(handle)))
wxchan's avatar
wxchan committed
1796
1797
1798
            state['handle'] = handle
        self.__dict__.update(state)

wxchan's avatar
wxchan committed
1799
    def free_dataset(self):
Nikita Titov's avatar
Nikita Titov committed
1800
1801
1802
1803
1804
1805
1806
        """Free Booster's Datasets.

        Returns
        -------
        self : Booster
            Booster without Datasets.
        """
wxchan's avatar
wxchan committed
1807
1808
        self.__dict__.pop('train_set', None)
        self.__dict__.pop('valid_sets', None)
1809
        self.__num_dataset = 0
Nikita Titov's avatar
Nikita Titov committed
1810
        return self
wxchan's avatar
wxchan committed
1811

1812
1813
1814
    def _free_buffer(self):
        self.__inner_predict_buffer = []
        self.__is_predicted_cur_iter = []
Nikita Titov's avatar
Nikita Titov committed
1815
        return self
1816

1817
1818
1819
1820
1821
1822
    def set_network(self, machines, local_listen_port=12400,
                    listen_time_out=120, num_machines=1):
        """Set the network configuration.

        Parameters
        ----------
        machines : list, set or string
            Names of machines.
            A list or set is joined into one comma-separated string.
        local_listen_port : int, optional (default=12400)
            TCP listen port for local machines.
        listen_time_out : int, optional (default=120)
            Socket time-out in minutes.
        num_machines : int, optional (default=1)
            The number of machines for parallel learning application.

        Returns
        -------
        self : Booster
            Booster with set network.
        """
        # The value is passed on through c_str(), which is given a single string:
        # collections are joined with commas, the same way __init__ does it.
        if isinstance(machines, (list, set)):
            machines = ','.join(machines)
        _safe_call(_LIB.LGBM_NetworkInit(c_str(machines),
                                         ctypes.c_int(local_listen_port),
                                         ctypes.c_int(listen_time_out),
                                         ctypes.c_int(num_machines)))
        self.network = True
        return self
1843
1844

    def free_network(self):
        """Free Booster's network and mark it as no longer set.

        Returns
        -------
        self : Booster
            Booster with freed network.
        """
        ret = _LIB.LGBM_NetworkFree()
        _safe_call(ret)
        self.network = False
        return self
1855

wxchan's avatar
wxchan committed
1856
    def set_train_data_name(self, name):
1857
1858
1859
1860
        """Set the name to the training Dataset.

        Parameters
        ----------
Nikita Titov's avatar
Nikita Titov committed
1861
1862
1863
1864
1865
1866
1867
        name : string
            Name for the training Dataset.

        Returns
        -------
        self : Booster
            Booster with set training Dataset name.
1868
        """
1869
        self._train_data_name = name
Nikita Titov's avatar
Nikita Titov committed
1870
        return self
wxchan's avatar
wxchan committed
1871
1872

    def add_valid(self, data, name):
        """Add validation data.

        Parameters
        ----------
        data : Dataset
            Validation data.
        name : string
            Name of validation data.

        Returns
        -------
        self : Booster
            Booster with set validation data.

        Raises
        ------
        TypeError
            If ``data`` is not a Dataset.
        LightGBMError
            If ``data`` does not use the same predictor object as the Booster.
        """
        if not isinstance(data, Dataset):
            type_name = type(data).__name__
            raise TypeError('Validation data should be Dataset instance, met {}'.format(type_name))
        # identity check: the very same predictor object must be shared
        if data._predictor is not self.__init_predictor:
            raise LightGBMError("Add validation data failed, "
                                "you should use same predictor for these data")
        valid_handle = data.construct().handle
        _safe_call(_LIB.LGBM_BoosterAddValidData(self.handle, valid_handle))
        # register the new dataset and give it its own prediction slot
        self.valid_sets.append(data)
        self.name_valid_sets.append(name)
        self.__num_dataset += 1
        self.__inner_predict_buffer.append(None)
        self.__is_predicted_cur_iter.append(False)
        return self
wxchan's avatar
wxchan committed
1902
1903

    def reset_parameter(self, params):
        """Reset parameters of Booster.

        Parameters
        ----------
        params : dict
            New parameters for Booster.

        Returns
        -------
        self : Booster
            Booster with new parameters.
        """
        # a changed metric means the evaluation info has to be reloaded
        metric_aliases = _ConfigAliases.get("metric")
        if any(alias in params for alias in metric_aliases):
            self.__need_reload_eval_info = True
        params_str = param_dict_to_str(params)
        # the C API is only called when there is something to pass on
        if params_str:
            ret = _LIB.LGBM_BoosterResetParameter(self.handle, c_str(params_str))
            _safe_call(ret)
        self.params.update(params)
        return self
wxchan's avatar
wxchan committed
1925
1926

    def update(self, train_set=None, fobj=None):
        """Update Booster for one iteration.

        Parameters
        ----------
        train_set : Dataset or None, optional (default=None)
            Training data.
            If None, last training data is used.
        fobj : callable or None, optional (default=None)
            Customized objective function.
            Should accept two parameters: preds, train_data,
            and return (grad, hess).

                preds : list or numpy 1-D array
                    The predicted values.
                train_data : Dataset
                    The training dataset.
                grad : list or numpy 1-D array
                    The value of the first order derivative (gradient) for each sample point.
                hess : list or numpy 1-D array
                    The value of the second order derivative (Hessian) for each sample point.

            For multi-class task, the preds is group by class_id first, then group by row_id.
            If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i]
            and you should group grad and hess in this way as well.

        Returns
        -------
        is_finished : bool
            Whether the update was successfully finished.
        """
        # need reset training data
        replace_train_set = train_set is not None and train_set is not self.train_set
        if replace_train_set:
            if not isinstance(train_set, Dataset):
                raise TypeError('Training data should be Dataset instance, met {}'
                                .format(type(train_set).__name__))
            if train_set._predictor is not self.__init_predictor:
                raise LightGBMError("Replace training data failed, "
                                    "you should use same predictor for these data")
            self.train_set = train_set
            _safe_call(_LIB.LGBM_BoosterResetTrainingData(
                self.handle,
                self.train_set.construct().handle))
            # cached predictions for the old training data are no longer valid
            self.__inner_predict_buffer[0] = None

        if fobj is not None:
            # custom objective: the built-in objective is switched off once
            if not self.__set_objective_to_none:
                self.reset_parameter({"objective": "none"})
                self.__set_objective_to_none = True
            grad, hess = fobj(self.__inner_predict(0), self.train_set)
            return self.__boost(grad, hess)

        if self.__set_objective_to_none:
            raise LightGBMError('Cannot update due to null objective function.')
        finished_flag = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
            self.handle,
            ctypes.byref(finished_flag)))
        # a new iteration invalidates the predictions of every dataset
        self.__is_predicted_cur_iter = [False] * self.__num_dataset
        return finished_flag.value == 1

    def __boost(self, grad, hess):
        """Boost Booster for one iteration with customized gradient statistics.

        .. note::

            For multi-class task, the score is group by class_id first, then group by row_id.
            If you want to get i-th row score in j-th class, the access way is score[j * num_data + i]
            and you should group grad and hess in this way as well.

        Parameters
        ----------
        grad : list or numpy 1-D array
            The first order derivative (gradient).
        hess : list or numpy 1-D array
            The second order derivative (Hessian).

        Returns
        -------
        is_finished : bool
            Whether the boost was successfully finished.

        Raises
        ------
        ValueError
            If ``grad`` and ``hess`` have different lengths.
        """
        grad_arr = list_to_1d_numpy(grad, name='gradient')
        hess_arr = list_to_1d_numpy(hess, name='hessian')
        # the raw data pointers are handed to the C API below
        assert grad_arr.flags.c_contiguous
        assert hess_arr.flags.c_contiguous
        n_grad, n_hess = len(grad_arr), len(hess_arr)
        if n_grad != n_hess:
            raise ValueError("Lengths of gradient({}) and hessian({}) don't match"
                             .format(n_grad, n_hess))
        float_ptr = ctypes.POINTER(ctypes.c_float)
        finished_flag = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
            self.handle,
            grad_arr.ctypes.data_as(float_ptr),
            hess_arr.ctypes.data_as(float_ptr),
            ctypes.byref(finished_flag)))
        # a new iteration invalidates the predictions of every dataset
        self.__is_predicted_cur_iter = [False] * self.__num_dataset
        return finished_flag.value == 1

    def rollback_one_iter(self):
        """Rollback one iteration.

        Returns
        -------
        self : Booster
            Booster with rolled back one iteration.
        """
        ret = _LIB.LGBM_BoosterRollbackOneIter(self.handle)
        _safe_call(ret)
        # predictions of every dataset are stale after the rollback
        self.__is_predicted_cur_iter = [False] * self.__num_dataset
        return self
wxchan's avatar
wxchan committed
2034
2035

    def current_iteration(self):
        """Get the index of the current iteration.

        Returns
        -------
        cur_iter : int
            The index of the current iteration.
        """
        cur_iter = ctypes.c_int(0)
        ret = _LIB.LGBM_BoosterGetCurrentIteration(self.handle, ctypes.byref(cur_iter))
        _safe_call(ret)
        return cur_iter.value

2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
    def num_model_per_iteration(self):
        """Get number of models per iteration.

        Returns
        -------
        model_per_iter : int
            The number of models per iteration.
        """
        out_count = ctypes.c_int(0)
        ret = _LIB.LGBM_BoosterNumModelPerIteration(self.handle, ctypes.byref(out_count))
        _safe_call(ret)
        return out_count.value

    def num_trees(self):
        """Get number of weak sub-models.

        Returns
        -------
        num_trees : int
            The number of weak sub-models.
        """
        out_total = ctypes.c_int(0)
        ret = _LIB.LGBM_BoosterNumberOfTotalModel(self.handle, ctypes.byref(out_total))
        _safe_call(ret)
        return out_total.value

wxchan's avatar
wxchan committed
2077
    def eval(self, data, name, feval=None):
        """Evaluate for data.

        If ``data`` is neither the training Dataset nor an already added
        validation Dataset, it is first added as validation data under ``name``.

        Parameters
        ----------
        data : Dataset
            Data for the evaluating.
        name : string
            Name of the data.
        feval : callable or None, optional (default=None)
            Customized evaluation function.
            Should accept two parameters: preds, eval_data,
            and return (eval_name, eval_result, is_higher_better) or list of such tuples.

                preds : list or numpy 1-D array
                    The predicted values.
                eval_data : Dataset
                    The evaluation dataset.
                eval_name : string
                    The name of evaluation function (without whitespaces).
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Is eval result higher better, e.g. AUC is ``is_higher_better``.

            For multi-class task, the preds is group by class_id first, then group by row_id.
            If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].

        Returns
        -------
        result : list
            List with evaluation results.
        """
        if not isinstance(data, Dataset):
            raise TypeError("Can only eval for Dataset instance")
        # index 0 is the training data; validation sets follow from index 1
        if data is self.train_set:
            data_idx = 0
        else:
            data_idx = -1
            for position, valid_set in enumerate(self.valid_sets, 1):
                if data is valid_set:
                    data_idx = position
                    break
        # need to push new valid data
        if data_idx == -1:
            self.add_valid(data, name)
            data_idx = self.__num_dataset - 1

        return self.__inner_eval(name, data_idx, feval)

    def eval_train(self, feval=None):
2128
        """Evaluate for training data.
wxchan's avatar
wxchan committed
2129
2130
2131

        Parameters
        ----------
2132
        feval : callable or None, optional (default=None)
2133
            Customized evaluation function.
2134
2135
            Should accept two parameters: preds, train_data,
            and return (eval_name, eval_result, is_higher_better) or list of such tuples.
2136
2137
2138
2139
2140
2141

                preds : list or numpy 1-D array
                    The predicted values.
                train_data : Dataset
                    The training dataset.
                eval_name : string
2142
                    The name of evaluation function (without whitespaces).
2143
2144
2145
2146
2147
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Is eval result higher better, e.g. AUC is ``is_higher_better``.

2148
2149
            For multi-class task, the preds is group by class_id first, then group by row_id.
            If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
wxchan's avatar
wxchan committed
2150
2151
2152

        Returns
        -------
Nikita Titov's avatar
Nikita Titov committed
2153
        result : list
2154
            List with evaluation results.
wxchan's avatar
wxchan committed
2155
        """
2156
        return self.__inner_eval(self._train_data_name, 0, feval)
wxchan's avatar
wxchan committed
2157
2158

    def eval_valid(self, feval=None):
        """Evaluate for validation data.

        Parameters
        ----------
        feval : callable or None, optional (default=None)
            Customized evaluation function.
            Should accept two parameters: preds, valid_data,
            and return (eval_name, eval_result, is_higher_better) or list of such tuples.

                preds : list or numpy 1-D array
                    The predicted values.
                valid_data : Dataset
                    The validation dataset.
                eval_name : string
                    The name of evaluation function (without whitespaces).
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Is eval result higher better, e.g. AUC is ``is_higher_better``.

            For multi-class task, the preds is group by class_id first, then group by row_id.
            If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].

        Returns
        -------
        result : list
            List with evaluation results.
        """
        results = []
        # dataset index 0 is the training data, so validation sets start at 1
        for data_idx in range_(1, self.__num_dataset):
            valid_name = self.name_valid_sets[data_idx - 1]
            for item in self.__inner_eval(valid_name, data_idx, feval):
                results.append(item)
        return results
wxchan's avatar
wxchan committed
2189

2190
    def save_model(self, filename, num_iteration=None, start_iteration=0):
        """Save Booster to file.

        Parameters
        ----------
        filename : string
            Filename to save Booster.
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.

        Returns
        -------
        self : Booster
            Returns self.
        """
        iteration_to_save = self.best_iteration if num_iteration is None else num_iteration
        ret = _LIB.LGBM_BoosterSaveModel(
            self.handle,
            ctypes.c_int(start_iteration),
            ctypes.c_int(iteration_to_save),
            c_str(filename))
        _safe_call(ret)
        # the pandas categorical information is written to the same file afterwards
        _dump_pandas_categorical(self.pandas_categorical, filename)
        return self
wxchan's avatar
wxchan committed
2218

2219
    def shuffle_models(self, start_iteration=0, end_iteration=-1):
        """Shuffle models.

        Parameters
        ----------
        start_iteration : int, optional (default=0)
            The first iteration that will be shuffled.
        end_iteration : int, optional (default=-1)
            The last iteration that will be shuffled.
            If <= 0, means the last available iteration.

        Returns
        -------
        self : Booster
            Booster with shuffled models.
        """
        first_iter = ctypes.c_int(start_iteration)
        last_iter = ctypes.c_int(end_iteration)
        _safe_call(_LIB.LGBM_BoosterShuffleModels(self.handle, first_iter, last_iter))
        return self
2240
2241
2242
2243
2244
2245

    def model_from_string(self, model_str, verbose=True):
        """Load Booster from a string.

        Any booster currently held by this object is freed first and the inner
        prediction buffers are reset.

        Parameters
        ----------
        model_str : string
            Model will be loaded from this string.
        verbose : bool, optional (default=True)
            Whether to print messages while loading model.

        Returns
        -------
        self : Booster
            Loaded Booster object.
        """
        # release the previous booster before its handle is overwritten
        if self.handle is not None:
            _safe_call(_LIB.LGBM_BoosterFree(self.handle))
        self._free_buffer()
        self.handle = ctypes.c_void_p()
        num_iterations = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterLoadModelFromString(
            c_str(model_str),
            ctypes.byref(num_iterations),
            ctypes.byref(self.handle)))
        num_class = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterGetNumClasses(
            self.handle,
            ctypes.byref(num_class)))
        if verbose:
            print('Finished loading model, total used %d iterations' % int(num_iterations.value))
        self.__num_class = num_class.value
        self.pandas_categorical = _load_pandas_categorical(model_str=model_str)
        return self

    def model_to_string(self, num_iteration=None, start_iteration=0):
        """Save Booster to string.

        Parameters
        ----------
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.

        Returns
        -------
        str_repr : string
            String representation of Booster.
        """
        if num_iteration is None:
            num_iteration = self.best_iteration
        # filled in by the C API on every call
        needed_len = ctypes.c_int64(0)

        def _save_into(capacity):
            """Save the model into a fresh buffer of ``capacity`` bytes and return the buffer."""
            buf = ctypes.create_string_buffer(capacity)
            buf_ptr = ctypes.c_char_p(ctypes.addressof(buf))
            _safe_call(_LIB.LGBM_BoosterSaveModelToString(
                self.handle,
                ctypes.c_int(start_iteration),
                ctypes.c_int(num_iteration),
                ctypes.c_int64(capacity),
                ctypes.byref(needed_len),
                buf_ptr))
            return buf

        initial_capacity = 1 << 20
        string_buffer = _save_into(initial_capacity)
        actual_len = needed_len.value
        # if buffer length is not long enough, re-allocate a buffer
        if actual_len > initial_capacity:
            string_buffer = _save_into(actual_len)
        model_text = string_buffer.value.decode()
        # the pandas categorical information is appended to the model text
        return model_text + _dump_pandas_categorical(self.pandas_categorical)
2320

2321
    def dump_model(self, num_iteration=None, start_iteration=0):
        """Dump Booster to JSON format.

        Parameters
        ----------
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be dumped.
            If None, if the best iteration exists, it is dumped; otherwise, all iterations are dumped.
            If <= 0, all iterations are dumped.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be dumped.

        Returns
        -------
        json_repr : dict
            JSON format of Booster.
        """
        if num_iteration is None:
            num_iteration = self.best_iteration

        def _dump_into(capacity):
            """Call the C API once with a fresh buffer of ``capacity`` bytes.

            Returns the buffer together with the length reported by the C API.
            """
            out_len = ctypes.c_int64(0)
            buf = ctypes.create_string_buffer(capacity)
            _safe_call(_LIB.LGBM_BoosterDumpModel(
                self.handle,
                ctypes.c_int(start_iteration),
                ctypes.c_int(num_iteration),
                ctypes.c_int64(capacity),
                ctypes.byref(out_len),
                ctypes.c_char_p(ctypes.addressof(buf))))
            return buf, out_len.value

        initial_len = 1 << 20
        string_buffer, actual_len = _dump_into(initial_len)
        # if buffer length is not long enough, reallocate a buffer
        # of the reported length and call again
        if actual_len > initial_len:
            string_buffer, _ = _dump_into(actual_len)
        # Decode explicitly as UTF-8 so the result does not depend on the
        # interpreter's default codec (which is ASCII on Python 2).
        ret = json.loads(string_buffer.value.decode('utf-8'))
        # Round-trip through JSON so that the stored categories end up as plain
        # JSON-compatible Python objects in the returned dict.
        ret['pandas_categorical'] = json.loads(json.dumps(self.pandas_categorical,
                                                          default=json_default_with_numpy))
        return ret
wxchan's avatar
wxchan committed
2367

2368
2369
    def predict(self, data, num_iteration=None,
                raw_score=False, pred_leaf=False, pred_contrib=False,
                data_has_header=False, is_reshape=True, **kwargs):
        """Make a prediction.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
            Data source for prediction.
            If string, it represents the path to txt file.
        num_iteration : int or None, optional (default=None)
            Limit number of iterations in the prediction.
            If None, if the best iteration exists, it is used; otherwise, all iterations are used.
            If <= 0, all iterations are used (no limits).
        raw_score : bool, optional (default=False)
            Whether to predict raw scores.
        pred_leaf : bool, optional (default=False)
            Whether to predict leaf index.
        pred_contrib : bool, optional (default=False)
            Whether to predict feature contributions.

            .. note::

                If you want to get more explanations for your model's predictions using SHAP values,
                like SHAP interaction values,
                you can install the shap package (https://github.com/slundberg/shap).
                Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
                column, where the last column is the expected value.

        data_has_header : bool, optional (default=False)
            Whether the data has header.
            Used only if data is string.
        is_reshape : bool, optional (default=True)
            If True, result is reshaped to [nrow, ncol].
        **kwargs
            Other parameters for the prediction.

        Returns
        -------
        result : numpy array
            Prediction result.
        """
        # The predictor receives its own copy of the extra parameters.
        inner_predictor = self._to_predictor(copy.deepcopy(kwargs))
        iteration_limit = self.best_iteration if num_iteration is None else num_iteration
        return inner_predictor.predict(
            data, iteration_limit,
            raw_score, pred_leaf, pred_contrib,
            data_has_header, is_reshape)
wxchan's avatar
wxchan committed
2416

2417
    def refit(self, data, label, decay_rate=0.9, **kwargs):
        """Refit the existing Booster by new data.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
            Data source for refit.
            If string, it represents the path to txt file.
        label : list, numpy 1-D array or pandas Series / one-column DataFrame
            Label for refit.
        decay_rate : float, optional (default=0.9)
            Decay rate of refit,
            will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees.
        **kwargs
            Other parameters for refit.
            These parameters will be passed to ``predict`` method.

        Returns
        -------
        result : Booster
            Refitted Booster.
        """
        if self.__set_objective_to_none:
            raise LightGBMError('Cannot refit due to null objective function.')

        # Leaf index of every row in every tree of the current model.
        leaf_predictor = self._to_predictor(copy.deepcopy(kwargs))
        leaf_indices = leaf_predictor.predict(data, -1, pred_leaf=True)
        n_rows, n_cols = leaf_indices.shape

        # Fresh Booster on the new data, configured with the requested decay rate.
        refit_data = Dataset(data, label, silent=True)
        refit_params = copy.deepcopy(self.params)
        refit_params['refit_decay_rate'] = decay_rate
        refitted = Booster(refit_params, refit_data, silent=True)

        # Copy models
        _safe_call(_LIB.LGBM_BoosterMerge(
            refitted.handle,
            leaf_predictor.handle))

        # The C API receives the leaf indices as one flat int array plus its dimensions.
        leaf_indices = leaf_indices.reshape(-1)
        leaf_ptr, _, _ = c_int_array(leaf_indices)
        _safe_call(_LIB.LGBM_BoosterRefit(
            refitted.handle,
            leaf_ptr,
            ctypes.c_int(n_rows),
            ctypes.c_int(n_cols)))

        # Carry over the network flag and the user attributes.
        refitted.network = self.network
        refitted.__attr = self.__attr.copy()
        return refitted

2463
    def get_leaf_output(self, tree_id, leaf_id):
        """Get the output of a leaf.

        Parameters
        ----------
        tree_id : int
            The index of the tree.
        leaf_id : int
            The index of the leaf in the tree.

        Returns
        -------
        result : float
            The output of the leaf.
        """
        # The C API writes the value into this out-parameter.
        leaf_value = ctypes.c_double(0)
        status = _LIB.LGBM_BoosterGetLeafValue(
            self.handle,
            ctypes.c_int(tree_id),
            ctypes.c_int(leaf_id),
            ctypes.byref(leaf_value))
        _safe_call(status)
        return leaf_value.value

2486
    def _to_predictor(self, pred_parameter=None):
        """Convert to predictor.

        The predictor is built on this Booster's handle and shares its
        ``pandas_categorical`` value.
        """
        inner = _InnerPredictor(booster_handle=self.handle,
                                pred_parameter=pred_parameter)
        inner.pandas_categorical = self.pandas_categorical
        return inner

2492
    def num_feature(self):
        """Get number of features.

        Returns
        -------
        num_feature : int
            The number of features.
        """
        # The C API writes the count into this out-parameter.
        feature_count = ctypes.c_int(0)
        status = _LIB.LGBM_BoosterGetNumFeature(
            self.handle,
            ctypes.byref(feature_count))
        _safe_call(status)
        return feature_count.value

wxchan's avatar
wxchan committed
2506
    def feature_name(self):
        """Get names of features.

        Returns
        -------
        result : list
            List with names of features.

        Raises
        ------
        ValueError
            If the number of names reported by the C API differs from ``num_feature()``.
        """
        num_feature = self.num_feature()
        # Get name of features
        tmp_out_len = ctypes.c_int(0)
        # One fixed-size 255-byte buffer per feature; the C API fills them in place.
        string_buffers = [ctypes.create_string_buffer(255) for _ in range_(num_feature)]
        ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers))
        _safe_call(_LIB.LGBM_BoosterGetFeatureNames(
            self.handle,
            ctypes.byref(tmp_out_len),
            ptr_string_buffers))
        if num_feature != tmp_out_len.value:
            raise ValueError("Length of feature names doesn't equal with num_feature")
        # Decode explicitly as UTF-8 so the result does not depend on the
        # interpreter's default codec (which is ASCII on Python 2).
        return [name_buffer.value.decode('utf-8') for name_buffer in string_buffers]

2527
    def feature_importance(self, importance_type='split', iteration=None):
        """Get feature importances.

        Parameters
        ----------
        importance_type : string, optional (default="split")
            How the importance is calculated.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.
        iteration : int or None, optional (default=None)
            Limit number of iterations in the feature importance calculation.
            If None, if the best iteration exists, it is used; otherwise, all trees are used.
            If <= 0, all trees are used (no limits).

        Returns
        -------
        result : numpy array
            Array with feature importances.
        """
        tree_limit = self.best_iteration if iteration is None else iteration
        # Integer code handed to the C API: 0 for "split", 1 for "gain",
        # and -1 for any other value.
        type_code = 0 if importance_type == "split" else (1 if importance_type == "gain" else -1)
        # The C API fills this float64 array in place.
        importances = np.zeros(self.num_feature(), dtype=np.float64)
        _safe_call(_LIB.LGBM_BoosterFeatureImportance(
            self.handle,
            ctypes.c_int(tree_limit),
            ctypes.c_int(type_code),
            importances.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
        # "split" importances are returned as int32, everything else as float64.
        return importances.astype(np.int32) if type_code == 0 else importances
2564

2565
2566
2567
2568
2569
2570
2571
2572
2573
    def get_split_value_histogram(self, feature, bins=None, xgboost_style=False):
        """Get split value histogram for the specified feature.

        Parameters
        ----------
        feature : int or string
            The feature name or index the histogram is calculated for.
            If int, interpreted as index.
            If string, interpreted as name.

            .. warning::

                Categorical features are not supported.

        bins : int, string or None, optional (default=None)
            The maximum number of bins.
            If None, or int and > number of unique split values and ``xgboost_style=True``,
            the number of bins equals number of unique split values.
            If string, it should be one from the list of the supported values by ``numpy.histogram()`` function.
        xgboost_style : bool, optional (default=False)
            Whether the returned result should be in the same form as it is in XGBoost.
            If False, the returned value is tuple of 2 numpy arrays as it is in ``numpy.histogram()`` function.
            If True, the returned value is matrix, in which the first column is the right edges of non-empty bins
            and the second one is the histogram values.

        Returns
        -------
        result_tuple : tuple of 2 numpy arrays
            If ``xgboost_style=False``, the values of the histogram of used splitting values for the specified feature
            and the bin edges.
        result_array_like : numpy array or pandas DataFrame (if pandas is installed)
            If ``xgboost_style=True``, the histogram of used splitting values for the specified feature.
        """
        dumped = self.dump_model()
        names = dumped.get('feature_names')
        thresholds = []

        def collect(node):
            """Recursively gather thresholds of splits made on the requested feature."""
            if 'split_index' not in node:  # leaf
                return
            node_feature = node['split_feature']
            # Compare by name only when names are available and a name was requested.
            if names is not None and isinstance(feature, string_type):
                node_feature = names[node_feature]
            if node_feature == feature:
                threshold = node['threshold']
                # A string threshold is treated as a categorical split.
                if isinstance(threshold, string_type):
                    raise LightGBMError('Cannot compute split value histogram for the categorical feature')
                thresholds.append(threshold)
            collect(node['left_child'])
            collect(node['right_child'])

        for tree_info in dumped['tree_info']:
            collect(tree_info['tree_structure'])

        if bins is None or (isinstance(bins, integer_types) and xgboost_style):
            # Cap the bin count at the number of distinct thresholds, but never below 1.
            n_unique = len(np.unique(thresholds))
            capped = n_unique if bins is None else min(n_unique, bins)
            bins = max(capped, 1)
        hist, bin_edges = np.histogram(thresholds, bins=bins)
        if not xgboost_style:
            return hist, bin_edges
        # Right bin edge next to its count, keeping non-empty bins only.
        table = np.column_stack((bin_edges[1:], hist))
        table = table[table[:, 1] > 0]
        if PANDAS_INSTALLED:
            return DataFrame(table, columns=['SplitValue', 'Count'])
        return table

wxchan's avatar
wxchan committed
2634
    def __inner_eval(self, data_name, data_idx, feval=None):
        """Evaluate training or validation data.

        Returns a list of ``(data_name, eval_name, value, is_higher_better)`` tuples:
        first the built-in metrics, then whatever ``feval`` produced (if given).
        """
        if data_idx >= self.__num_dataset:
            raise ValueError("Data_idx should be smaller than number of dataset")
        self.__get_eval_info()
        evaluations = []
        n_metrics = self.__num_inner_eval
        if n_metrics > 0:
            # The C API fills this array with one value per built-in metric.
            metric_values = np.zeros(n_metrics, dtype=np.float64)
            reported_len = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetEval(
                self.handle,
                ctypes.c_int(data_idx),
                ctypes.byref(reported_len),
                metric_values.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
            if reported_len.value != n_metrics:
                raise ValueError("Wrong length of eval results")
            evaluations.extend(
                (data_name, self.__name_inner_eval[k],
                 metric_values[k], self.__higher_better_inner_eval[k])
                for k in range_(n_metrics))
        if feval is None:
            return evaluations
        # Index 0 is the training set; validation sets follow, shifted by one.
        dataset = self.train_set if data_idx == 0 else self.valid_sets[data_idx - 1]
        custom = feval(self.__inner_predict(data_idx), dataset)
        # A custom function may return one triple or a list of triples.
        if not isinstance(custom, list):
            custom = [custom]
        for eval_name, val, is_higher_better in custom:
            evaluations.append((data_name, eval_name, val, is_higher_better))
        return evaluations

    def __inner_predict(self, data_idx):
        """Predict for training and validation dataset.

        The result is kept in a per-dataset buffer, which is refreshed only
        when it has not yet been filled for the current iteration.
        """
        if data_idx >= self.__num_dataset:
            raise ValueError("Data_idx should be smaller than number of dataset")
        if self.__inner_predict_buffer[data_idx] is None:
            # Lazily allocate one slot per (row, class) pair of the dataset.
            dataset = self.train_set if data_idx == 0 else self.valid_sets[data_idx - 1]
            buffer_size = dataset.num_data() * self.__num_class
            self.__inner_predict_buffer[data_idx] = np.zeros(buffer_size, dtype=np.float64)
        # avoid to predict many time in one iteration
        if self.__is_predicted_cur_iter[data_idx]:
            return self.__inner_predict_buffer[data_idx]
        reported_len = ctypes.c_int64(0)
        buffer_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        _safe_call(_LIB.LGBM_BoosterGetPredict(
            self.handle,
            ctypes.c_int(data_idx),
            ctypes.byref(reported_len),
            buffer_ptr))
        if reported_len.value != len(self.__inner_predict_buffer[data_idx]):
            raise ValueError("Wrong length of predict results for data %d" % (data_idx))
        self.__is_predicted_cur_iter[data_idx] = True
        return self.__inner_predict_buffer[data_idx]

    def __get_eval_info(self):
        """Get inner evaluation count and names.

        Does nothing unless a reload has been requested; the request flag is
        cleared before the C API is queried.
        """
        if not self.__need_reload_eval_info:
            return
        self.__need_reload_eval_info = False
        # Get num of inner evals
        eval_count = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterGetEvalCounts(
            self.handle,
            ctypes.byref(eval_count)))
        self.__num_inner_eval = eval_count.value
        if self.__num_inner_eval <= 0:
            return
        # Get name of evals
        reported_len = ctypes.c_int(0)
        # One fixed-size 255-byte buffer per metric name.
        name_buffers = [ctypes.create_string_buffer(255) for i in range_(self.__num_inner_eval)]
        name_ptrs = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, name_buffers))
        _safe_call(_LIB.LGBM_BoosterGetEvalNames(
            self.handle,
            ctypes.byref(reported_len),
            name_ptrs))
        if self.__num_inner_eval != reported_len.value:
            raise ValueError("Length of eval names doesn't equal with num_evals")
        self.__name_inner_eval = [
            name_buffers[i].value.decode() for i in range_(self.__num_inner_eval)
        ]
        # Metrics whose names start with one of these prefixes are flagged "higher is better".
        higher_is_better_prefixes = ('auc', 'ndcg@', 'map@')
        self.__higher_better_inner_eval = [
            name.startswith(higher_is_better_prefixes) for name in self.__name_inner_eval
        ]
2716

wxchan's avatar
wxchan committed
2717
    def attr(self, key):
        """Get attribute string from the Booster.

        Parameters
        ----------
        key : string
            The name of the attribute.

        Returns
        -------
        value : string or None
            The attribute value.
            Returns None if attribute does not exist.
        """
        stored = self.__attr
        return stored.get(key)
wxchan's avatar
wxchan committed
2732
2733

    def set_attr(self, **kwargs):
        """Set attributes to the Booster.

        Parameters
        ----------
        **kwargs
            The attributes to set.
            Setting a value to None deletes an attribute.

        Returns
        -------
        self : Booster
            Booster with set attributes.
        """
        for name, val in kwargs.items():
            if val is None:
                # None means "remove"; a missing key is silently ignored.
                self.__attr.pop(name, None)
                continue
            if not isinstance(val, string_type):
                raise ValueError("Only string values are accepted")
            self.__attr[name] = val
        return self