"include/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "dc6995742a5284a1e942978e2542fc49adda9ea1"
basic.py 117 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
"""Wrapper for C API of LightGBM."""
wxchan's avatar
wxchan committed
3
4
from __future__ import absolute_import

5
import copy
wxchan's avatar
wxchan committed
6
import ctypes
7
import os
wxchan's avatar
wxchan committed
8
import warnings
wxchan's avatar
wxchan committed
9
from tempfile import NamedTemporaryFile
10
from collections import OrderedDict
wxchan's avatar
wxchan committed
11
12
13
14

import numpy as np
import scipy.sparse

15
from .compat import (PANDAS_INSTALLED, DataFrame, Series, is_dtype_sparse,
16
                     DataTable,
17
18
                     decode_string, string_type,
                     integer_types, numeric_types,
19
                     json, json_default_with_numpy,
20
                     range_, zip_)
wxchan's avatar
wxchan committed
21
22
from .libpath import find_lib_path

wxchan's avatar
wxchan committed
23

wxchan's avatar
wxchan committed
24
def _load_lib():
    """Locate and load the LightGBM shared library.

    Returns
    -------
    lib : ctypes.CDLL or None
        The library found at the first path reported by ``find_lib_path()``,
        with ``LGBM_GetLastError`` declared as returning a C string,
        or None when no path was found.
    """
    candidates = find_lib_path()
    if len(candidates) > 0:
        loaded = ctypes.cdll.LoadLibrary(candidates[0])
        # Error messages come back as ``char*``, not the default ``int``.
        loaded.LGBM_GetLastError.restype = ctypes.c_char_p
        return loaded
    return None

wxchan's avatar
wxchan committed
33

wxchan's avatar
wxchan committed
34
35
_LIB = _load_lib()

wxchan's avatar
wxchan committed
36

wxchan's avatar
wxchan committed
37
def _safe_call(ret):
38
39
    """Check the return value from C API call.

wxchan's avatar
wxchan committed
40
41
42
    Parameters
    ----------
    ret : int
43
        The return value from C API calls.
wxchan's avatar
wxchan committed
44
45
    """
    if ret != 0:
46
        raise LightGBMError(decode_string(_LIB.LGBM_GetLastError()))
wxchan's avatar
wxchan committed
47

wxchan's avatar
wxchan committed
48

wxchan's avatar
wxchan committed
49
def is_numeric(obj):
    """Tell whether ``obj`` can be converted with ``float()``.

    Returns
    -------
    result : bool
        True when ``float(obj)`` succeeds, False when it raises
        TypeError (not a string or a number) or ValueError (invalid literal).
    """
    try:
        float(obj)
    except (TypeError, ValueError):
        return False
    else:
        return True

wxchan's avatar
wxchan committed
59

wxchan's avatar
wxchan committed
60
def is_numpy_1d_array(data):
    """Return True only for a ``numpy.ndarray`` with exactly one dimension."""
    if not isinstance(data, np.ndarray):
        return False
    return data.ndim == 1
wxchan's avatar
wxchan committed
63

wxchan's avatar
wxchan committed
64

wxchan's avatar
wxchan committed
65
def is_1d_list(data):
    """Return True for an empty list or a list whose first item is numeric.

    Only the first element is inspected; anything that is not a ``list``
    yields False.
    """
    if not isinstance(data, list):
        return False
    if not data:
        return True
    return is_numeric(data[0])
wxchan's avatar
wxchan committed
68

wxchan's avatar
wxchan committed
69

70
def list_to_1d_numpy(data, dtype=np.float32, name='list'):
    """Convert data to numpy 1-D array.

    Parameters
    ----------
    data : list, numpy 1-D array or pandas Series
        Values to convert.
    dtype : numpy dtype, optional (default=np.float32)
        Target element type.
    name : string, optional (default='list')
        Name of the argument, used only in the error message.

    Returns
    -------
    result : numpy array
        1-D array of ``dtype``. The input array itself is returned when it
        already has the requested dtype.

    Raises
    ------
    ValueError
        If a pandas Series has an unsupported dtype.
    TypeError
        If ``data`` is none of the supported container types.
    """
    if is_numpy_1d_array(data):
        if data.dtype == dtype:
            return data
        return data.astype(dtype=dtype, copy=False)
    if is_1d_list(data):
        # np.asarray copies only when it has to. np.array(..., copy=False)
        # raises on NumPy >= 2.0 whenever a copy is unavoidable, which is
        # always the case for a Python list.
        return np.asarray(data, dtype=dtype)
    if isinstance(data, Series):
        if _get_bad_pandas_dtypes([data.dtypes]):
            raise ValueError('Series.dtypes must be int, float or bool')
        return np.asarray(data, dtype=dtype)  # SparseArray should be supported as well
    raise TypeError("Wrong type({0}) for {1}.\n"
                    "It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
wxchan's avatar
wxchan committed
86

wxchan's avatar
wxchan committed
87

wxchan's avatar
wxchan committed
88
def cfloat32_array_to_numpy(cptr, length):
    """Copy ``length`` values from a ctypes float pointer into a numpy array.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_float)
        Pointer to the first element.
    length : int
        Number of elements to read.

    Returns
    -------
    result : numpy array
        1-D ``np.float32`` array.

    Raises
    ------
    RuntimeError
        If ``cptr`` is not a float pointer.
    """
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
        raise RuntimeError('Expected float pointer')
    return np.fromiter(cptr, dtype=np.float32, count=length)
wxchan's avatar
wxchan committed
94

Guolin Ke's avatar
Guolin Ke committed
95

Guolin Ke's avatar
Guolin Ke committed
96
def cfloat64_array_to_numpy(cptr, length):
    """Copy ``length`` values from a ctypes double pointer into a numpy array.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_double)
        Pointer to the first element.
    length : int
        Number of elements to read.

    Returns
    -------
    result : numpy array
        1-D ``np.float64`` array.

    Raises
    ------
    RuntimeError
        If ``cptr`` is not a double pointer.
    """
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_double)):
        raise RuntimeError('Expected double pointer')
    return np.fromiter(cptr, dtype=np.float64, count=length)

wxchan's avatar
wxchan committed
103

wxchan's avatar
wxchan committed
104
def cint32_array_to_numpy(cptr, length):
    """Copy ``length`` values from a ctypes int32 pointer into a numpy array.

    Parameters
    ----------
    cptr : ctypes.POINTER(ctypes.c_int32)
        Pointer to the first element.
    length : int
        Number of elements to read.

    Returns
    -------
    result : numpy array
        1-D ``np.int32`` array.

    Raises
    ------
    RuntimeError
        If ``cptr`` is not an int32 pointer.
    """
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
        raise RuntimeError('Expected int pointer')
    return np.fromiter(cptr, dtype=np.int32, count=length)
wxchan's avatar
wxchan committed
110

wxchan's avatar
wxchan committed
111

wxchan's avatar
wxchan committed
112
def c_str(string):
    """Encode a Python string as UTF-8 and wrap it in a ``ctypes.c_char_p``."""
    encoded = string.encode('utf-8')
    return ctypes.c_char_p(encoded)

wxchan's avatar
wxchan committed
116

wxchan's avatar
wxchan committed
117
def c_array(ctype, values):
    """Build a ctypes array of ``ctype`` holding the items of ``values``."""
    array_type = ctype * len(values)
    return array_type(*values)

wxchan's avatar
wxchan committed
121

wxchan's avatar
wxchan committed
122
def param_dict_to_str(data):
    """Serialize a parameter dictionary into the ``key=value`` string for the C API.

    Parameters
    ----------
    data : dict or None
        Parameters to serialize.

    Returns
    -------
    result : string
        Space-separated ``key=value`` pairs. List, tuple, set and numpy 1-D
        array values are joined with commas, entries whose value is None are
        omitted, and an empty or None ``data`` gives an empty string.

    Raises
    ------
    TypeError
        If a value is neither a sequence, a string, a number nor None.
    """
    if not data:
        return ""
    pairs = []
    for key, val in data.items():
        if isinstance(val, (list, tuple, set)) or is_numpy_1d_array(val):
            rendered = ','.join(map(str, val))
        elif isinstance(val, string_type) or isinstance(val, numeric_types) or is_numeric(val):
            rendered = str(val)
        elif val is None:
            continue
        else:
            raise TypeError('Unknown type of parameter:%s, got:%s'
                            % (key, type(val).__name__))
        pairs.append(str(key) + '=' + rendered)
    return ' '.join(pairs)
136

wxchan's avatar
wxchan committed
137

138
class _TempFile(object):
139
140
141
142
    def __enter__(self):
        with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
            self.name = f.name
        return self
wxchan's avatar
wxchan committed
143

144
145
146
    def __exit__(self, exc_type, exc_val, exc_tb):
        if os.path.isfile(self.name):
            os.remove(self.name)
wxchan's avatar
wxchan committed
147

148
149
150
151
    def readlines(self):
        with open(self.name, "r+") as f:
            ret = f.readlines()
        return ret
wxchan's avatar
wxchan committed
152

153
154
    def writelines(self, lines):
        with open(self.name, "w+") as f:
155
            f.writelines(lines)
156

wxchan's avatar
wxchan committed
157

158
class LightGBMError(Exception):
    """Error thrown by LightGBM."""


164
class _ConfigAliases(object):
165
166
167
    aliases = {"bin_construct_sample_cnt": {"bin_construct_sample_cnt",
                                            "subsample_for_bin"},
               "boosting": {"boosting",
168
169
170
171
172
173
                            "boosting_type",
                            "boost"},
               "categorical_feature": {"categorical_feature",
                                       "cat_feature",
                                       "categorical_column",
                                       "cat_column"},
174
175
               "data_random_seed": {"data_random_seed",
                                    "data_seed"},
176
177
178
179
               "early_stopping_round": {"early_stopping_round",
                                        "early_stopping_rounds",
                                        "early_stopping",
                                        "n_iter_no_change"},
180
181
182
               "enable_bundle": {"enable_bundle",
                                 "is_enable_bundle",
                                 "bundle"},
183
184
185
186
187
               "eval_at": {"eval_at",
                           "ndcg_eval_at",
                           "ndcg_at",
                           "map_eval_at",
                           "map_at"},
188
189
190
191
192
193
               "group_column": {"group_column",
                                "group",
                                "group_id",
                                "query_column",
                                "query",
                                "query_id"},
194
195
               "header": {"header",
                          "has_header"},
196
197
198
199
200
201
202
203
204
               "ignore_column": {"ignore_column",
                                 "ignore_feature",
                                 "blacklist"},
               "is_enable_sparse": {"is_enable_sparse",
                                    "is_sparse",
                                    "enable_sparse",
                                    "sparse"},
               "label_column": {"label_column",
                                "label"},
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
               "machines": {"machines",
                            "workers",
                            "nodes"},
               "metric": {"metric",
                          "metrics",
                          "metric_types"},
               "num_class": {"num_class",
                             "num_classes"},
               "num_iterations": {"num_iterations",
                                  "num_iteration",
                                  "n_iter",
                                  "num_tree",
                                  "num_trees",
                                  "num_round",
                                  "num_rounds",
                                  "num_boost_round",
                                  "n_estimators"},
               "objective": {"objective",
                             "objective_type",
                             "app",
                             "application"},
226
227
228
229
230
               "pre_partition": {"pre_partition",
                                 "is_pre_partition"},
               "two_round": {"two_round",
                             "two_round_loading",
                             "use_two_round_loading"},
231
               "verbosity": {"verbosity",
232
233
234
                             "verbose"},
               "weight_column": {"weight_column",
                                 "weight"}}
235
236
237
238
239

    @classmethod
    def get(cls, *args):
        ret = set()
        for i in args:
240
            ret |= cls.aliases.get(i, {i})
241
242
243
        return ret


244
245
# Largest value representable by a signed 32-bit integer.
MAX_INT32 = (1 << 31) - 1

# Macro definition of data type in C API of LightGBM
C_API_DTYPE_FLOAT32 = 0
C_API_DTYPE_FLOAT64 = 1
C_API_DTYPE_INT32 = 2
C_API_DTYPE_INT64 = 3

# Matrix is row major in Python
C_API_IS_ROW_MAJOR = 1

# Macro definition of prediction type in C API of LightGBM
C_API_PREDICT_NORMAL = 0
C_API_PREDICT_RAW_SCORE = 1
C_API_PREDICT_LEAF_INDEX = 2
C_API_PREDICT_CONTRIB = 3

# Data type of data field
FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
                     "weight": C_API_DTYPE_FLOAT32,
                     "init_score": C_API_DTYPE_FLOAT64,
                     "group": C_API_DTYPE_INT32}
wxchan's avatar
wxchan committed
266

wxchan's avatar
wxchan committed
267

268
def convert_from_sliced_object(data):
    """Return a contiguous copy of a non-contiguous numpy view.

    A warning is issued and ``np.copy(data)`` is returned when ``data`` is
    an ndarray whose ``base`` is another ndarray and that is not
    C-contiguous. Every other input is returned unchanged.
    """
    is_view = isinstance(data, np.ndarray) and isinstance(data.base, np.ndarray)
    if not is_view or data.flags.c_contiguous:
        return data
    warnings.warn("Usage of np.ndarray subset (sliced data) is not recommended "
                  "due to it will double the peak memory cost in LightGBM.")
    return np.copy(data)


wxchan's avatar
wxchan committed
278
def c_float_array(data):
    """Get pointer of float numpy array / list.

    Parameters
    ----------
    data : list or numpy 1-D array
        Values of dtype ``np.float32`` or ``np.float64``.

    Returns
    -------
    ptr_data : ctypes pointer
        Pointer to the first element of the returned array.
    type_data : int
        ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``.
    data : numpy array
        The array that owns the memory behind ``ptr_data``. The caller
        must keep a reference to it for as long as the pointer is in use.

    Raises
    ------
    TypeError
        If ``data`` is not a 1-D list/array or has a non-float dtype.
    """
    if is_1d_list(data):
        # np.array(..., copy=False) raises on NumPy >= 2.0 for list input;
        # np.asarray copies only when it has to on every NumPy version.
        data = np.asarray(data)
    if not is_numpy_1d_array(data):
        raise TypeError("Unknown type({})".format(type(data).__name__))
    data = convert_from_sliced_object(data)
    if not data.flags.c_contiguous:
        # Explicit fallback instead of an ``assert`` (asserts vanish under -O):
        # the C side reads the buffer as a dense array.
        data = np.ascontiguousarray(data)
    if data.dtype == np.float32:
        ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        type_data = C_API_DTYPE_FLOAT32
    elif data.dtype == np.float64:
        ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        type_data = C_API_DTYPE_FLOAT64
    else:
        raise TypeError("Expected np.float32 or np.float64, met type({})"
                        .format(data.dtype))
    return (ptr_data, type_data, data)  # return `data` to avoid the temporary copy is freed
wxchan's avatar
wxchan committed
297

wxchan's avatar
wxchan committed
298

wxchan's avatar
wxchan committed
299
def c_int_array(data):
    """Get pointer of int numpy array / list.

    Parameters
    ----------
    data : list or numpy 1-D array
        Values of dtype ``np.int32`` or ``np.int64``.

    Returns
    -------
    ptr_data : ctypes pointer
        Pointer to the first element of the returned array.
    type_data : int
        ``C_API_DTYPE_INT32`` or ``C_API_DTYPE_INT64``.
    data : numpy array
        The array that owns the memory behind ``ptr_data``. The caller
        must keep a reference to it for as long as the pointer is in use.

    Raises
    ------
    TypeError
        If ``data`` is not a 1-D list/array or has a non-integer dtype.
    """
    if is_1d_list(data):
        # np.array(..., copy=False) raises on NumPy >= 2.0 for list input;
        # np.asarray copies only when it has to on every NumPy version.
        data = np.asarray(data)
    if not is_numpy_1d_array(data):
        raise TypeError("Unknown type({})".format(type(data).__name__))
    data = convert_from_sliced_object(data)
    if not data.flags.c_contiguous:
        # Explicit fallback instead of an ``assert`` (asserts vanish under -O):
        # the C side reads the buffer as a dense array.
        data = np.ascontiguousarray(data)
    if data.dtype == np.int32:
        ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
        type_data = C_API_DTYPE_INT32
    elif data.dtype == np.int64:
        ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
        type_data = C_API_DTYPE_INT64
    else:
        raise TypeError("Expected np.int32 or np.int64, met type({})"
                        .format(data.dtype))
    return (ptr_data, type_data, data)  # return `data` to avoid the temporary copy is freed
wxchan's avatar
wxchan committed
318

wxchan's avatar
wxchan committed
319

320
321
322
323
324
325
326
327
328
329
330
def _get_bad_pandas_dtypes(dtypes):
    pandas_dtype_mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int',
                           'int64': 'int', 'uint8': 'int', 'uint16': 'int',
                           'uint32': 'int', 'uint64': 'int', 'bool': 'int',
                           'float16': 'float', 'float32': 'float', 'float64': 'float'}
    bad_indices = [i for i, dtype in enumerate(dtypes) if (dtype.name not in pandas_dtype_mapper
                                                           and (not is_dtype_sparse(dtype)
                                                                or dtype.subtype.name not in pandas_dtype_mapper))]
    return bad_indices


331
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
    """Turn a pandas DataFrame into a float numpy matrix plus resolved metadata.

    Input that is not a DataFrame is passed through untouched; only the
    ``'auto'`` placeholders are replaced by None.

    Parameters
    ----------
    data : pandas DataFrame or any other object
        Input data.
    feature_name : list, 'auto' or None
        Feature names. With ``'auto'`` or None the column labels are
        converted to strings; with ``'auto'`` they are also returned.
    categorical_feature : list, 'auto' or None
        Categorical features. ``'auto'`` selects the unordered
        ``category`` columns of the DataFrame.
    pandas_categorical : list of lists or None
        Category lists recorded at training time, one per ``category``
        column. None means this is the training data and the lists are
        collected here.

    Returns
    -------
    data, feature_name, categorical_feature, pandas_categorical : tuple
        The converted data and the resolved metadata.

    Raises
    ------
    ValueError
        If the DataFrame is empty or not 2-dimensional, if the number of
        ``category`` columns differs from ``pandas_categorical``, or if a
        column has an unsupported dtype.
    """
    if isinstance(data, DataFrame):
        if len(data.shape) != 2 or data.shape[0] < 1:
            raise ValueError('Input data must be 2 dimensional and non empty.')
        if feature_name == 'auto' or feature_name is None:
            data = data.rename(columns=str)
        cat_cols = list(data.select_dtypes(include=['category']).columns)
        cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
        is_train = pandas_categorical is None
        if is_train:  # train dataset
            pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
        elif len(cat_cols) != len(pandas_categorical):
            raise ValueError('train and valid dataset categorical_feature do not match.')
        if cat_cols:
            # Copy BEFORE the first column assignment: with an explicit
            # feature_name list `data` is still the caller's DataFrame here.
            data = data.copy()  # not alter origin DataFrame
            if not is_train:
                # Align the categories with the ones seen at training time.
                for col, category in zip_(cat_cols, pandas_categorical):
                    if list(data[col].cat.categories) != list(category):
                        data[col] = data[col].cat.set_categories(category)
            # Category codes become the numeric values; -1 (missing) becomes NaN.
            data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
        if categorical_feature is not None:
            if feature_name is None:
                feature_name = list(data.columns)
            if categorical_feature == 'auto':  # use cat cols from DataFrame
                categorical_feature = cat_cols_not_ordered
            else:  # use cat cols specified by user
                categorical_feature = list(categorical_feature)
        if feature_name == 'auto':
            feature_name = list(data.columns)
        bad_indices = _get_bad_pandas_dtypes(data.dtypes)
        if bad_indices:
            # Column labels are not guaranteed to be strings when the caller
            # supplied feature_name, so convert them before joining.
            bad_columns = [col if isinstance(col, string_type) else str(col)
                           for col in data.columns[bad_indices]]
            raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
                             "Did not expect the data types in the following fields: "
                             + ', '.join(bad_columns))
        data = data.values
        if data.dtype != np.float32 and data.dtype != np.float64:
            data = data.astype(np.float32)
    else:
        if feature_name == 'auto':
            feature_name = None
        if categorical_feature == 'auto':
            categorical_feature = None
    return data, feature_name, categorical_feature, pandas_categorical
373
374
375
376
377
378


def _label_from_pandas(label):
    """Flatten a single-column DataFrame label into a 1-D float32 array.

    Anything that is not a DataFrame is returned unchanged.

    Raises
    ------
    ValueError
        If the DataFrame has more than one column or an unsupported dtype.
    """
    if not isinstance(label, DataFrame):
        return label
    if len(label.columns) > 1:
        raise ValueError('DataFrame for label cannot have multiple columns')
    if _get_bad_pandas_dtypes(label.dtypes):
        raise ValueError('DataFrame.dtypes for label must be int, float or bool')
    as_float = label.values.astype(np.float32, copy=False)
    return np.ravel(as_float)


385
386
387
388
389
390
391
392
393
394
395
def _dump_pandas_categorical(pandas_categorical, file_name=None):
    """Serialize ``pandas_categorical`` as a ``pandas_categorical:<json>`` line.

    The line (wrapped in newlines) is appended to ``file_name`` when one is
    given, and is returned in every case.
    """
    encoded = json.dumps(pandas_categorical, default=json_default_with_numpy)
    pandas_str = '\npandas_categorical:' + encoded + '\n'
    if file_name is None:
        return pandas_str
    with open(file_name, 'a') as f:
        f.write(pandas_str)
    return pandas_str


def _load_pandas_categorical(file_name=None, model_str=None):
396
397
    pandas_key = 'pandas_categorical:'
    offset = -len(pandas_key)
398
    if file_name is not None:
399
400
401
402
403
404
405
406
407
408
409
410
411
        max_offset = -os.path.getsize(file_name)
        with open(file_name, 'rb') as f:
            while True:
                if offset < max_offset:
                    offset = max_offset
                f.seek(offset, os.SEEK_END)
                lines = f.readlines()
                if len(lines) >= 2:
                    break
                offset *= 2
        last_line = decode_string(lines[-1]).strip()
        if not last_line.startswith(pandas_key):
            last_line = decode_string(lines[-2]).strip()
412
    elif model_str is not None:
413
414
415
416
417
418
        idx = model_str.rfind('\n', 0, offset)
        last_line = model_str[idx:].strip()
    if last_line.startswith(pandas_key):
        return json.loads(last_line[len(pandas_key):])
    else:
        return None
419
420


Guolin Ke's avatar
Guolin Ke committed
421
class _InnerPredictor(object):
422
423
424
425
426
    """_InnerPredictor of LightGBM.

    Not exposed to user.
    Used only for prediction, usually used for continued training.

Nikita Titov's avatar
Nikita Titov committed
427
428
429
    .. note::

        Can be converted from Booster, but cannot be converted to Booster.
Guolin Ke's avatar
Guolin Ke committed
430
    """
431

432
    def __init__(self, model_file=None, booster_handle=None, pred_parameter=None):
433
        """Initialize the _InnerPredictor.
wxchan's avatar
wxchan committed
434
435
436

        Parameters
        ----------
437
        model_file : string or None, optional (default=None)
wxchan's avatar
wxchan committed
438
            Path to the model file.
439
440
441
442
        booster_handle : object or None, optional (default=None)
            Handle of Booster.
        pred_parameter: dict or None, optional (default=None)
            Other parameters for the prediciton.
wxchan's avatar
wxchan committed
443
444
445
446
447
        """
        self.handle = ctypes.c_void_p()
        self.__is_manage_handle = True
        if model_file is not None:
            """Prediction task"""
Guolin Ke's avatar
Guolin Ke committed
448
            out_num_iterations = ctypes.c_int(0)
wxchan's avatar
wxchan committed
449
450
451
452
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(out_num_iterations),
                ctypes.byref(self.handle)))
Guolin Ke's avatar
Guolin Ke committed
453
            out_num_class = ctypes.c_int(0)
wxchan's avatar
wxchan committed
454
455
456
457
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
458
            self.num_total_iteration = out_num_iterations.value
459
            self.pandas_categorical = _load_pandas_categorical(file_name=model_file)
wxchan's avatar
wxchan committed
460
        elif booster_handle is not None:
Guolin Ke's avatar
Guolin Ke committed
461
            self.__is_manage_handle = False
wxchan's avatar
wxchan committed
462
            self.handle = booster_handle
Guolin Ke's avatar
Guolin Ke committed
463
            out_num_class = ctypes.c_int(0)
wxchan's avatar
wxchan committed
464
465
466
467
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
Guolin Ke's avatar
Guolin Ke committed
468
            out_num_iterations = ctypes.c_int(0)
wxchan's avatar
wxchan committed
469
470
471
            _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
                self.handle,
                ctypes.byref(out_num_iterations)))
472
            self.num_total_iteration = out_num_iterations.value
473
            self.pandas_categorical = None
wxchan's avatar
wxchan committed
474
        else:
475
            raise TypeError('Need model_file or booster_handle to create a predictor')
wxchan's avatar
wxchan committed
476

477
478
        pred_parameter = {} if pred_parameter is None else pred_parameter
        self.pred_parameter = param_dict_to_str(pred_parameter)
cbecker's avatar
cbecker committed
479

wxchan's avatar
wxchan committed
480
    def __del__(self):
481
482
483
484
485
        try:
            if self.__is_manage_handle:
                _safe_call(_LIB.LGBM_BoosterFree(self.handle))
        except AttributeError:
            pass
wxchan's avatar
wxchan committed
486

487
488
489
490
491
    def __getstate__(self):
        this = self.__dict__.copy()
        this.pop('handle', None)
        return this

wxchan's avatar
wxchan committed
492
    def predict(self, data, num_iteration=-1,
                raw_score=False, pred_leaf=False, pred_contrib=False, data_has_header=False,
                is_reshape=True):
        """Predict logic.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
            Data source for prediction.
            When data type is string, it represents the path of txt file.
        num_iteration : int, optional (default=-1)
            Iteration used for prediction.
            Values above the number of available iterations are clamped to it.
        raw_score : bool, optional (default=False)
            Whether to predict raw scores.
        pred_leaf : bool, optional (default=False)
            Whether to predict leaf index.
        pred_contrib : bool, optional (default=False)
            Whether to predict feature contributions.
        data_has_header : bool, optional (default=False)
            Whether data has header.
            Used only for txt data.
        is_reshape : bool, optional (default=True)
            Whether to reshape to (nrow, ncol).

        Returns
        -------
        result : numpy array
            Prediction result.

        Raises
        ------
        TypeError
            If ``data`` is a Dataset or cannot be converted to a supported type.
        ValueError
            If a list cannot be converted to a numpy array, or the number of
            predictions is not a multiple of the number of rows.
        """
        if isinstance(data, Dataset):
            raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
        data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]
        # When several flags are set the later check wins:
        # pred_contrib > pred_leaf > raw_score.
        predict_type = C_API_PREDICT_NORMAL
        if raw_score:
            predict_type = C_API_PREDICT_RAW_SCORE
        if pred_leaf:
            predict_type = C_API_PREDICT_LEAF_INDEX
        if pred_contrib:
            predict_type = C_API_PREDICT_CONTRIB
        int_data_has_header = 1 if data_has_header else 0
        if num_iteration > self.num_total_iteration:
            num_iteration = self.num_total_iteration

        if isinstance(data, string_type):
            # File input: the library writes its result to a temporary file,
            # one tab-separated line per input row.
            with _TempFile() as f:
                _safe_call(_LIB.LGBM_BoosterPredictForFile(
                    self.handle,
                    c_str(data),
                    ctypes.c_int(int_data_has_header),
                    ctypes.c_int(predict_type),
                    ctypes.c_int(num_iteration),
                    c_str(self.pred_parameter),
                    c_str(f.name)))
                lines = f.readlines()
                nrow = len(lines)
                preds = [float(token) for line in lines for token in line.split('\t')]
                preds = np.array(preds, dtype=np.float64, copy=False)
        elif isinstance(data, scipy.sparse.csr_matrix):
            preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
        elif isinstance(data, scipy.sparse.csc_matrix):
            preds, nrow = self.__pred_for_csc(data, num_iteration, predict_type)
        elif isinstance(data, np.ndarray):
            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
        elif isinstance(data, list):
            try:
                data = np.array(data)
            except Exception:  # not BaseException: let KeyboardInterrupt/SystemExit through
                raise ValueError('Cannot convert data list to numpy array.')
            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
        elif isinstance(data, DataTable):
            preds, nrow = self.__pred_for_np2d(data.to_numpy(), num_iteration, predict_type)
        else:
            try:
                warnings.warn('Converting data to scipy sparse matrix.')
                csr = scipy.sparse.csr_matrix(data)
            except Exception:  # not BaseException: let KeyboardInterrupt/SystemExit through
                raise TypeError('Cannot predict data for type {}'.format(type(data).__name__))
            preds, nrow = self.__pred_for_csr(csr, num_iteration, predict_type)
        if pred_leaf:
            preds = preds.astype(np.int32)
        if is_reshape and preds.size != nrow:
            if preds.size % nrow == 0:
                preds = preds.reshape(nrow, -1)
            else:
                raise ValueError('Length of predict result (%d) cannot be divide nrow (%d)'
                                 % (preds.size, nrow))
        return preds

    def __get_num_preds(self, num_iteration, nrow, predict_type):
        """Get size of prediction result.

        Parameters
        ----------
        num_iteration : int
            Iteration used for prediction.
        nrow : int
            Number of rows to predict; must not exceed ``MAX_INT32``.
        predict_type : int
            One of the ``C_API_PREDICT_*`` constants.

        Returns
        -------
        n_preds : int
            Number of values reported by ``LGBM_BoosterCalcNumPredict``.

        Raises
        ------
        LightGBMError
            If ``nrow`` is greater than ``MAX_INT32``.
        """
        if nrow > MAX_INT32:
            # Adjacent literals are concatenated verbatim, so each piece
            # carries its own separating space.
            raise LightGBMError('LightGBM cannot perform prediction for data '
                                'with number of rows greater than MAX_INT32 (%d).\n'
                                'You can split your data into chunks '
                                'and then concatenate predictions for them' % MAX_INT32)
        n_preds = ctypes.c_int64(0)
        _safe_call(_LIB.LGBM_BoosterCalcNumPredict(
            self.handle,
            ctypes.c_int(nrow),
            ctypes.c_int(predict_type),
            ctypes.c_int(num_iteration),
            ctypes.byref(n_preds)))
        return n_preds.value
wxchan's avatar
wxchan committed
595
596

    def __pred_for_np2d(self, mat, num_iteration, predict_type):
        """Predict for a 2-D numpy matrix.

        Matrices with more than ``MAX_INT32`` rows are cut into row blocks which
        are predicted one by one into a single pre-allocated output array.

        Returns
        -------
        preds : numpy 1-D array
            Flat float64 array holding the predictions.
        nrow : int
            Number of rows of ``mat``.
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray or list must be 2 dimensional')

        def inner_predict(mat, num_iteration, predict_type, preds=None):
            """Predict one block of rows, writing into ``preds`` when it is supplied."""
            flat = mat.reshape(mat.size)
            if mat.dtype == np.float32 or mat.dtype == np.float64:
                data = np.array(flat, dtype=mat.dtype, copy=False)
            else:  # change non-float data to float data, need to copy
                data = np.array(flat, dtype=np.float32)
            ptr_data, type_ptr_data, _ = c_float_array(data)
            block_nrow = mat.shape[0]
            n_preds = self.__get_num_preds(num_iteration, block_nrow, predict_type)
            if preds is None:
                preds = np.zeros(n_preds, dtype=np.float64)
            elif preds.ndim != 1 or len(preds) != n_preds:
                raise ValueError("Wrong length of pre-allocated predict array")
            out_num_preds = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterPredictForMat(
                self.handle,
                ptr_data,
                ctypes.c_int(type_ptr_data),
                ctypes.c_int(block_nrow),
                ctypes.c_int(mat.shape[1]),
                ctypes.c_int(C_API_IS_ROW_MAJOR),
                ctypes.c_int(predict_type),
                ctypes.c_int(num_iteration),
                c_str(self.pred_parameter),
                ctypes.byref(out_num_preds),
                preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
            # the library reports how many values it wrote; it must match the buffer size
            if n_preds != out_num_preds.value:
                raise ValueError("Wrong length for predict results")
            return preds, block_nrow

        nrow = mat.shape[0]
        if nrow <= MAX_INT32:
            return inner_predict(mat, num_iteration, predict_type)

        # row positions at which the matrix is cut into blocks
        sections = np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)
        # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
        block_sizes = np.diff([0] + list(sections) + [nrow])
        n_preds = [self.__get_num_preds(num_iteration, i, predict_type) for i in block_sizes]
        offsets = np.array([0] + n_preds, dtype=np.intp).cumsum()
        preds = np.zeros(sum(n_preds), dtype=np.float64)
        blocks = np.array_split(mat, sections)
        for block, start_idx_pred, end_idx_pred in zip_(blocks, offsets, offsets[1:]):
            # avoid memory consumption by arrays concatenation operations
            inner_predict(block, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
        return preds, nrow
wxchan's avatar
wxchan committed
643
644

    def __pred_for_csr(self, csr, num_iteration, predict_type):
        """Predict for a CSR data.

        Inputs with more than ``MAX_INT32`` rows are split into row slices which
        are predicted one after another into a single pre-allocated array.

        Parameters
        ----------
        csr : scipy.sparse CSR matrix
            Data to predict on. The matrix itself is left untouched.
        num_iteration : int
            Passed to the C API as the number of iterations.
        predict_type : int
            Passed to the C API as the prediction type.

        Returns
        -------
        preds : numpy 1-D array
            Flat float64 array holding the predictions.
        nrow : int
            Number of rows of ``csr``.
        """
        def inner_predict(csr, num_iteration, predict_type, preds=None):
            """Predict one CSR block, writing into ``preds`` when it is supplied."""
            nrow = len(csr.indptr) - 1
            n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
            if preds is None:
                preds = np.zeros(n_preds, dtype=np.float64)
            elif len(preds.shape) != 1 or len(preds) != n_preds:
                raise ValueError("Wrong length of pre-allocated predict array")
            out_num_preds = ctypes.c_int64(0)

            ptr_indptr, type_ptr_indptr, _ = c_int_array(csr.indptr)
            ptr_data, type_ptr_data, _ = c_float_array(csr.data)

            assert csr.shape[1] <= MAX_INT32
            # Convert into a local array instead of assigning back to csr.indices,
            # so the caller's matrix is not modified as a side effect of predicting.
            indices = csr.indices.astype(np.int32, copy=False)

            _safe_call(_LIB.LGBM_BoosterPredictForCSR(
                self.handle,
                ptr_indptr,
                ctypes.c_int32(type_ptr_indptr),
                indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
                ptr_data,
                ctypes.c_int(type_ptr_data),
                ctypes.c_int64(len(csr.indptr)),
                ctypes.c_int64(len(csr.data)),
                ctypes.c_int64(csr.shape[1]),
                ctypes.c_int(predict_type),
                ctypes.c_int(num_iteration),
                c_str(self.pred_parameter),
                ctypes.byref(out_num_preds),
                preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
            # the library reports how many values it wrote; it must match the buffer size
            if n_preds != out_num_preds.value:
                raise ValueError("Wrong length for predict results")
            return preds, nrow

        nrow = len(csr.indptr) - 1
        if nrow > MAX_INT32:
            sections = [0] + list(np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)) + [nrow]
            # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
            n_preds = [self.__get_num_preds(num_iteration, i, predict_type) for i in np.diff(sections)]
            n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
            preds = np.zeros(sum(n_preds), dtype=np.float64)
            for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip_(zip_(sections, sections[1:]),
                                                                             zip_(n_preds_sections, n_preds_sections[1:])):
                # avoid memory consumption by arrays concatenation operations
                inner_predict(csr[start_idx:end_idx], num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
            return preds, nrow
        else:
            return inner_predict(csr, num_iteration, predict_type)
Guolin Ke's avatar
Guolin Ke committed
694
695

    def __pred_for_csc(self, csc, num_iteration, predict_type):
        """Predict for a CSC data.

        Inputs with more than ``MAX_INT32`` rows are converted to CSR and handled
        by the CSR prediction routine.

        Parameters
        ----------
        csc : scipy.sparse CSC matrix
            Data to predict on. The matrix itself is left untouched.
        num_iteration : int
            Passed to the C API as the number of iterations.
        predict_type : int
            Passed to the C API as the prediction type.

        Returns
        -------
        preds : numpy 1-D array
            Flat float64 array holding the predictions.
        nrow : int
            Number of rows of ``csc``.
        """
        nrow = csc.shape[0]
        if nrow > MAX_INT32:
            return self.__pred_for_csr(csc.tocsr(), num_iteration, predict_type)
        # From here on nrow <= MAX_INT32 holds, so the row indices fit into int32.
        n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
        preds = np.zeros(n_preds, dtype=np.float64)
        out_num_preds = ctypes.c_int64(0)

        ptr_indptr, type_ptr_indptr, _ = c_int_array(csc.indptr)
        ptr_data, type_ptr_data, _ = c_float_array(csc.data)

        # Convert into a local array instead of assigning back to csc.indices,
        # so the caller's matrix is not modified as a side effect of predicting.
        indices = csc.indices.astype(np.int32, copy=False)

        _safe_call(_LIB.LGBM_BoosterPredictForCSC(
            self.handle,
            ptr_indptr,
            ctypes.c_int32(type_ptr_indptr),
            indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int64(len(csc.indptr)),
            ctypes.c_int64(len(csc.data)),
            ctypes.c_int64(csc.shape[0]),
            ctypes.c_int(predict_type),
            ctypes.c_int(num_iteration),
            c_str(self.pred_parameter),
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
        # the library reports how many values it wrote; it must match the buffer size
        if n_preds != out_num_preds.value:
            raise ValueError("Wrong length for predict results")
        return preds, nrow

wxchan's avatar
wxchan committed
729

wxchan's avatar
wxchan committed
730
731
class Dataset(object):
    """Dataset in LightGBM."""
732

733
    def __init__(self, data, label=None, reference=None,
734
                 weight=None, group=None, init_score=None, silent=False,
735
                 feature_name='auto', categorical_feature='auto', params=None,
wxchan's avatar
wxchan committed
736
                 free_raw_data=True):
737
        """Initialize Dataset.
738

wxchan's avatar
wxchan committed
739
740
        Parameters
        ----------
741
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse or list of numpy arrays
wxchan's avatar
wxchan committed
742
            Data source of Dataset.
743
            If string, it represents the path to txt file.
744
        label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
745
746
747
            Label of the data.
        reference : Dataset or None, optional (default=None)
            If this is Dataset for validation, training data should be used as reference.
748
        weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
wxchan's avatar
wxchan committed
749
            Weight for each instance.
750
        group : list, numpy 1-D array, pandas Series or None, optional (default=None)
751
            Group/query size for Dataset.
752
        init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
753
            Init score for Dataset.
754
755
756
757
758
759
760
761
762
        silent : bool, optional (default=False)
            Whether to print messages during construction.
        feature_name : list of strings or 'auto', optional (default="auto")
            Feature names.
            If 'auto' and data is pandas DataFrame, data columns names are used.
        categorical_feature : list of strings or int, or 'auto', optional (default="auto")
            Categorical features.
            If list of int, interpreted as indices.
            If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
763
            If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
764
            All values in categorical features should be less than int32 max value (2147483647).
765
            Large values could be memory consuming. Consider using consecutive integers starting from zero.
766
            All negative values in categorical features will be treated as missing values.
767
            The output cannot be monotonically constrained with respect to a categorical feature.
Nikita Titov's avatar
Nikita Titov committed
768
        params : dict or None, optional (default=None)
769
            Other parameters for Dataset.
Nikita Titov's avatar
Nikita Titov committed
770
        free_raw_data : bool, optional (default=True)
771
            If True, raw data is freed after constructing inner Dataset.
wxchan's avatar
wxchan committed
772
        """
wxchan's avatar
wxchan committed
773
774
775
776
777
778
        self.handle = None
        self.data = data
        self.label = label
        self.reference = reference
        self.weight = weight
        self.group = group
779
        self.init_score = init_score
wxchan's avatar
wxchan committed
780
781
        self.silent = silent
        self.feature_name = feature_name
782
        self.categorical_feature = categorical_feature
783
        self.params = copy.deepcopy(params)
wxchan's avatar
wxchan committed
784
785
        self.free_raw_data = free_raw_data
        self.used_indices = None
786
        self.need_slice = True
wxchan's avatar
wxchan committed
787
        self._predictor = None
788
        self.pandas_categorical = None
789
        self.params_back_up = None
790
791
        self.feature_penalty = None
        self.monotone_constraints = None
792
        self.version = 0
wxchan's avatar
wxchan committed
793
794

    def __del__(self):
795
796
797
798
        try:
            self._free_handle()
        except AttributeError:
            pass
799

800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
    def get_params(self):
        """Get the used parameters in the Dataset.

        Returns
        -------
        params : dict or None
            The used parameters in this Dataset object.
            None when no parameters are stored on the object.
        """
        if self.params is None:
            return None
        # no min_data, nthreads and verbose in this function
        dataset_param_names = ("bin_construct_sample_cnt",
                               "categorical_feature",
                               "data_random_seed",
                               "enable_bundle",
                               "feature_pre_filter",
                               "forcedbins_filename",
                               "group_column",
                               "header",
                               "ignore_column",
                               "is_enable_sparse",
                               "label_column",
                               "max_bin",
                               "max_bin_by_feature",
                               "min_data_in_bin",
                               "pre_partition",
                               "two_round",
                               "use_missing",
                               "weight_column",
                               "zero_as_missing")
        dataset_params = _ConfigAliases.get(*dataset_param_names)
        # keep only the stored entries whose key is a Dataset-related parameter
        used_params = {}
        for key, value in self.params.items():
            if key in dataset_params:
                used_params[key] = value
        return used_params

831
    def _free_handle(self):
        """Release the native Dataset handle, if any, and reset slicing state.

        Returns
        -------
        self : Dataset
            This object, with ``handle`` set to None.
        """
        native_handle = self.handle
        if native_handle is not None:
            _safe_call(_LIB.LGBM_DatasetFree(native_handle))
            self.handle = None
        self.need_slice = True
        # when used_indices is set, the stored raw data is dropped as well
        if self.used_indices is not None:
            self.data = None
        return self
wxchan's avatar
wxchan committed
839

Guolin Ke's avatar
Guolin Ke committed
840
841
842
843
    def _set_init_score_by_predictor(self, predictor, data, used_indices=None):
        """Set the init score of this Dataset to the raw predictions of ``predictor``.

        Parameters
        ----------
        predictor : object
            Must provide ``predict(data, raw_score, data_has_header, is_reshape)``
            and a ``num_class`` attribute.
        data : string or other data accepted by ``predictor.predict``
            Data to predict on. If string, it is treated as a file and the
            "header" parameter (any alias) of this Dataset is honoured.
        used_indices : array-like of int or None, optional (default=None)
            Row indices of the subset. Only used to pick rows out of the
            predictions when ``data`` is a string.
        """
        data_has_header = False
        if isinstance(data, string_type):
            # check data has header or not
            # self.params may still be None when no params were given to the constructor
            params = self.params or {}
            data_has_header = any(params.get(alias, False) for alias in _ConfigAliases.get("header"))
        init_score = predictor.predict(data,
                                       raw_score=True,
                                       data_has_header=data_has_header,
                                       is_reshape=False)
        num_data = self.num_data()
        num_class = predictor.num_class
        if used_indices is not None:
            assert not self.need_slice
            if isinstance(data, string_type):
                assert num_data == len(used_indices)
                # predictions cover every row of the file: view them as
                # (row, class) and keep only the rows of the subset
                row_ids = np.asarray(used_indices, dtype=np.intp)
                init_score = init_score.reshape(-1, num_class)[row_ids].astype(np.float32).ravel()
        if num_class > 1:
            # need to regroup init_score: from row-major (row, class) order to
            # class-major order, i.e. new[j * num_data + i] = old[i * num_class + j]
            init_score = (init_score.reshape(num_data, num_class)
                          .ravel(order='F')
                          .astype(np.float32, copy=False))
        self.set_init_score(init_score)

868
    def _lazy_init(self, data, label=None, reference=None,
                   weight=None, group=None, init_score=None, predictor=None,
                   silent=False, feature_name='auto',
                   categorical_feature='auto', params=None):
        """Create the native Dataset from ``data`` and attach its metadata.

        The arguments have the same meaning as in the constructor. ``predictor``,
        when given, is used to compute the init score from its predictions.
        Note that ``params`` is modified in place and stored as ``self.params``.

        Returns
        -------
        self or the return value of ``set_feature_name``
            ``self`` when ``data`` is None (nothing is constructed),
            otherwise whatever ``set_feature_name(feature_name)`` returns.

        Raises
        ------
        TypeError
            On a wrong entry in ``categorical_feature``, a ``reference`` that is not
            a Dataset, data that cannot be turned into a Dataset, or a wrong
            ``predictor`` type.
        ValueError
            If no label is available after construction.
        """
        if data is None:
            self.handle = None
            return self
        if reference is not None:
            self.pandas_categorical = reference.pandas_categorical
            categorical_feature = reference.categorical_feature
        data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data,
                                                                                             feature_name,
                                                                                             categorical_feature,
                                                                                             self.pandas_categorical)
        label = _label_from_pandas(label)

        # process for args
        params = {} if params is None else params
        # names of this method's own arguments; they must not be passed via `params`
        lazy_init_code = getattr(self.__class__, '_lazy_init').__code__
        args_names = lazy_init_code.co_varnames[:lazy_init_code.co_argcount]
        for key in params:
            if key in args_names:
                warnings.warn('{0} keyword has been found in `params` and will be ignored.\n'
                              'Please use {0} argument of the Dataset constructor to pass this parameter.'
                              .format(key))
        # user can set verbose with params, it has higher priority
        if not any(verbose_alias in params for verbose_alias in _ConfigAliases.get("verbosity")) and silent:
            params["verbose"] = -1
        # get categorical features
        if categorical_feature is not None:
            categorical_indices = set()
            feature_dict = {}
            if feature_name is not None:
                feature_dict = {name: i for i, name in enumerate(feature_name)}
            for name in categorical_feature:
                if isinstance(name, string_type) and name in feature_dict:
                    categorical_indices.add(feature_dict[name])
                elif isinstance(name, integer_types):
                    categorical_indices.add(name)
                else:
                    raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
                                    .format(type(name).__name__, name))
            if categorical_indices:
                for cat_alias in _ConfigAliases.get("categorical_feature"):
                    if cat_alias in params:
                        warnings.warn('{} in param dict is overridden.'.format(cat_alias))
                        params.pop(cat_alias, None)
                params['categorical_column'] = sorted(categorical_indices)

        params_str = param_dict_to_str(params)
        self.params = params
        # process for reference dataset
        ref_dataset = None
        if isinstance(reference, Dataset):
            ref_dataset = reference.construct().handle
        elif reference is not None:
            raise TypeError('Reference dataset should be None or dataset instance')
        # start construct data
        if isinstance(data, string_type):
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_DatasetCreateFromFile(
                c_str(data),
                c_str(params_str),
                ref_dataset,
                ctypes.byref(self.handle)))
        elif isinstance(data, scipy.sparse.csr_matrix):
            self.__init_from_csr(data, params_str, ref_dataset)
        elif isinstance(data, scipy.sparse.csc_matrix):
            self.__init_from_csc(data, params_str, ref_dataset)
        elif isinstance(data, np.ndarray):
            self.__init_from_np2d(data, params_str, ref_dataset)
        elif isinstance(data, list) and len(data) > 0 and all(isinstance(x, np.ndarray) for x in data):
            self.__init_from_list_np2d(data, params_str, ref_dataset)
        elif isinstance(data, DataTable):
            self.__init_from_np2d(data.to_numpy(), params_str, ref_dataset)
        else:
            # last resort: anything scipy can convert to a CSR matrix
            try:
                csr = scipy.sparse.csr_matrix(data)
                self.__init_from_csr(csr, params_str, ref_dataset)
            except Exception:
                # Exception rather than BaseException, so that KeyboardInterrupt
                # and SystemExit are not turned into a misleading TypeError.
                raise TypeError('Cannot initialize Dataset from {}'.format(type(data).__name__))
        if label is not None:
            self.set_label(label)
        if self.get_label() is None:
            raise ValueError("Label should not be None")
        if weight is not None:
            self.set_weight(weight)
        if group is not None:
            self.set_group(group)
        if isinstance(predictor, _InnerPredictor):
            if self._predictor is None and init_score is not None:
                warnings.warn("The init_score will be overridden by the prediction of init_model.")
            self._set_init_score_by_predictor(predictor, data)
        elif init_score is not None:
            self.set_init_score(init_score)
        elif predictor is not None:
            raise TypeError('Wrong predictor type {}'.format(type(predictor).__name__))
        # set feature names
        return self.set_feature_name(feature_name)
wxchan's avatar
wxchan committed
968
969

    def __init_from_np2d(self, mat, params_str, ref_dataset):
        """Initialize data from a 2-D numpy matrix.

        Creates the native Dataset in ``self.handle`` and returns ``self``.
        Raises ValueError when ``mat`` is not 2-dimensional.
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')

        self.handle = ctypes.c_void_p()
        flat = mat.reshape(mat.size)
        is_float = mat.dtype == np.float32 or mat.dtype == np.float64
        if is_float:
            data = np.array(flat, dtype=mat.dtype, copy=False)
        else:  # change non-float data to float data, need to copy
            data = np.array(flat, dtype=np.float32)

        ptr_data, type_ptr_data, _ = c_float_array(data)
        num_row, num_col = mat.shape
        _safe_call(_LIB.LGBM_DatasetCreateFromMat(
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int(num_row),
            ctypes.c_int(num_col),
            ctypes.c_int(C_API_IS_ROW_MAJOR),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self
wxchan's avatar
wxchan committed
991

992
    def __init_from_list_np2d(self, mats, params_str, ref_dataset):
        """Initialize data from a list of 2-D numpy matrices.

        All matrices must be 2-dimensional, have the same number of columns and
        end up with the same float type. The list ``mats`` and its arrays are not
        modified. Creates the native Dataset in ``self.handle`` and returns ``self``.

        Raises
        ------
        ValueError
            If a matrix is not 2-dimensional, the column counts differ,
            or the chunks do not share one float type.
        """
        ncol = mats[0].shape[1]
        nrow = np.zeros((len(mats),), np.int32)
        if mats[0].dtype == np.float64:
            ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))()
        else:
            ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))()

        # Flattened chunks are collected in a local list rather than written
        # back into `mats`, which belongs to the caller. The list (together with
        # the holders returned by c_float_array) keeps the buffers referenced
        # until the C call below has finished.
        flat_mats = []
        holders = []
        type_ptr_data = None

        for i, mat in enumerate(mats):
            if len(mat.shape) != 2:
                raise ValueError('Input numpy.ndarray must be 2 dimensional')

            if mat.shape[1] != ncol:
                raise ValueError('Input arrays must have same number of columns')

            nrow[i] = mat.shape[0]

            if mat.dtype == np.float32 or mat.dtype == np.float64:
                flat = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
            else:  # change non-float data to float data, need to copy
                flat = np.array(mat.reshape(mat.size), dtype=np.float32)
            flat_mats.append(flat)

            chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(flat)
            if type_ptr_data is not None and chunk_type_ptr_data != type_ptr_data:
                raise ValueError('Input chunks must have same type')
            ptr_data[i] = chunk_ptr_data
            type_ptr_data = chunk_type_ptr_data
            holders.append(holder)

        self.handle = ctypes.c_void_p()
        _safe_call(_LIB.LGBM_DatasetCreateFromMats(
            ctypes.c_int(len(mats)),
            ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))),
            ctypes.c_int(type_ptr_data),
            nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ctypes.c_int(ncol),
            ctypes.c_int(C_API_IS_ROW_MAJOR),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self
1037

wxchan's avatar
wxchan committed
1038
    def __init_from_csr(self, csr, params_str, ref_dataset):
        """Initialize data from a CSR matrix.

        Creates the native Dataset in ``self.handle`` and returns ``self``.
        The matrix ``csr`` is not modified.
        Raises ValueError when ``csr.indices`` and ``csr.data`` differ in length.
        """
        if len(csr.indices) != len(csr.data):
            raise ValueError('Length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
        self.handle = ctypes.c_void_p()

        ptr_indptr, type_ptr_indptr, _ = c_int_array(csr.indptr)
        ptr_data, type_ptr_data, _ = c_float_array(csr.data)

        assert csr.shape[1] <= MAX_INT32
        # Convert into a local array instead of assigning back to csr.indices,
        # so the caller's matrix is not modified as a side effect.
        indices = csr.indices.astype(np.int32, copy=False)

        _safe_call(_LIB.LGBM_DatasetCreateFromCSR(
            ptr_indptr,
            ctypes.c_int(type_ptr_indptr),
            indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int64(len(csr.indptr)),
            ctypes.c_int64(len(csr.data)),
            ctypes.c_int64(csr.shape[1]),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self
wxchan's avatar
wxchan committed
1063

Guolin Ke's avatar
Guolin Ke committed
1064
    def __init_from_csc(self, csc, params_str, ref_dataset):
        """Initialize data from a CSC matrix.

        Creates the native Dataset in ``self.handle`` and returns ``self``.
        The matrix ``csc`` is not modified.
        Raises ValueError when ``csc.indices`` and ``csc.data`` differ in length.
        """
        if len(csc.indices) != len(csc.data):
            raise ValueError('Length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data)))
        self.handle = ctypes.c_void_p()

        ptr_indptr, type_ptr_indptr, _ = c_int_array(csc.indptr)
        ptr_data, type_ptr_data, _ = c_float_array(csc.data)

        assert csc.shape[0] <= MAX_INT32
        # Convert into a local array instead of assigning back to csc.indices,
        # so the caller's matrix is not modified as a side effect.
        indices = csc.indices.astype(np.int32, copy=False)

        _safe_call(_LIB.LGBM_DatasetCreateFromCSC(
            ptr_indptr,
            ctypes.c_int(type_ptr_indptr),
            indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int64(len(csc.indptr)),
            ctypes.c_int64(len(csc.data)),
            ctypes.c_int64(csc.shape[0]),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self
Guolin Ke's avatar
Guolin Ke committed
1089

wxchan's avatar
wxchan committed
1090
    def construct(self):
        """Lazy init.

        Does nothing when the Dataset already has a handle.

        Returns
        -------
        self : Dataset
            Constructed Dataset object.
        """
        if self.handle is not None:
            return self

        if self.reference is None:
            # create train
            self._lazy_init(self.data, label=self.label,
                            weight=self.weight, group=self.group,
                            init_score=self.init_score, predictor=self._predictor,
                            silent=self.silent, feature_name=self.feature_name,
                            categorical_feature=self.categorical_feature, params=self.params)
        elif self.used_indices is None:
            # create valid
            self._lazy_init(self.data, label=self.label, reference=self.reference,
                            weight=self.weight, group=self.group,
                            init_score=self.init_score, predictor=self._predictor,
                            silent=self.silent, feature_name=self.feature_name, params=self.params)
        else:
            # construct subset
            used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
            assert used_indices.flags.c_contiguous
            if self.reference.group is not None:
                group_info = np.array(self.reference.group).astype(np.int32, copy=False)
                # label every row of the reference with the index of its group,
                # then count how many selected rows fall into each group
                row_group_ids = np.repeat(range_(len(group_info)), repeats=group_info)
                _, self.group = np.unique(row_group_ids[self.used_indices],
                                          return_counts=True)
            self.handle = ctypes.c_void_p()
            params_str = param_dict_to_str(self.params)
            _safe_call(_LIB.LGBM_DatasetGetSubset(
                self.reference.construct().handle,
                used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
                ctypes.c_int(used_indices.shape[0]),
                c_str(params_str),
                ctypes.byref(self.handle)))
            if not self.free_raw_data:
                self.get_data()
            if self.group is not None:
                self.set_group(self.group)
            if self.get_label() is None:
                raise ValueError("Label should not be None.")
            own_predictor = self._predictor
            if isinstance(own_predictor, _InnerPredictor) and own_predictor is not self.reference._predictor:
                self.get_data()
                self._set_init_score_by_predictor(self._predictor, self.data, used_indices)

        if self.free_raw_data:
            self.data = None
        return self
wxchan's avatar
wxchan committed
1141

wxchan's avatar
wxchan committed
1142
    def create_valid(self, data, label=None, weight=None, group=None,
                     init_score=None, silent=False, params=None):
        """Build a validation Dataset that is aligned with the current Dataset.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse or list of numpy arrays
            Data source of Dataset.
            If string, it represents the path to txt file.
        label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
            Label of the data.
        weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
            Weight for each instance.
        group : list, numpy 1-D array, pandas Series or None, optional (default=None)
            Group/query size for Dataset.
        init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
            Init score for Dataset.
        silent : bool, optional (default=False)
            Whether to print messages during construction.
        params : dict or None, optional (default=None)
            Other parameters for validation Dataset.

        Returns
        -------
        valid : Dataset
            Validation Dataset with reference to self.
        """
        # The new Dataset references self and inherits its ``free_raw_data`` setting.
        options = dict(label=label, reference=self, weight=weight, group=group,
                       init_score=init_score, silent=silent, params=params,
                       free_raw_data=self.free_raw_data)
        valid_set = Dataset(data, **options)
        # Share the predictor and the pandas categorical mapping with the new Dataset.
        valid_set._predictor = self._predictor
        valid_set.pandas_categorical = self.pandas_categorical
        return valid_set
wxchan's avatar
wxchan committed
1175

wxchan's avatar
wxchan committed
1176
    def subset(self, used_indices, params=None):
        """Create a Dataset that covers only selected rows of the current Dataset.

        Parameters
        ----------
        used_indices : list of int
            Indices used to create the subset.
        params : dict or None, optional (default=None)
            These parameters will be passed to Dataset constructor.
            If None, the parameters of the current Dataset are used.

        Returns
        -------
        subset : Dataset
            Subset of the current Dataset.
        """
        subset_params = self.params if params is None else params
        # No raw data is passed: the subset is described by its reference and indices.
        child = Dataset(None,
                        reference=self,
                        feature_name=self.feature_name,
                        categorical_feature=self.categorical_feature,
                        params=subset_params,
                        free_raw_data=self.free_raw_data)
        child._predictor = self._predictor
        child.pandas_categorical = self.pandas_categorical
        # Indices are stored in ascending order.
        child.used_indices = sorted(used_indices)
        return child

    def save_binary(self, filename):
        """Write the Dataset to a binary file.

        .. note::

            Please note that `init_score` is not saved in binary file.
            If you need it, please set it again after loading Dataset.

        Parameters
        ----------
        filename : string
            Name of the output file.

        Returns
        -------
        self : Dataset
            Returns self.
        """
        # ``construct()`` is called first so that a handle is available to save.
        constructed = self.construct()
        status = _LIB.LGBM_DatasetSaveBinary(constructed.handle, c_str(filename))
        _safe_call(status)
        return self
wxchan's avatar
wxchan committed
1223
1224

    def _update_params(self, params):
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
        params = copy.deepcopy(params)

        def update():
            if not self.params:
                self.params = params
            else:
                self.params_back_up = copy.deepcopy(self.params)
                self.params.update(params)

        if self.handle is None:
            update()
        elif params is not None:
            ret = _LIB.LGBM_DatasetUpdateParamChecking(
                c_str(param_dict_to_str(self.params)),
                c_str(param_dict_to_str(params)))
            if ret != 0:
                # could be updated if data is not freed
                if self.data is not None:
                    update()
                    self._free_handle()
                else:
                    raise LightGBMError(decode_string(_LIB.LGBM_GetLastError()))
Nikita Titov's avatar
Nikita Titov committed
1247
        return self
wxchan's avatar
wxchan committed
1248

1249
    def _reverse_update_params(self):
1250
1251
1252
        if self.handle is None:
            self.params = copy.deepcopy(self.params_back_up)
            self.params_back_up = None
Nikita Titov's avatar
Nikita Titov committed
1253
        return self
1254

wxchan's avatar
wxchan committed
1255
    def set_field(self, field_name, data):
        """Store a property in the constructed Dataset.

        Parameters
        ----------
        field_name : string
            The field name of the information.
        data : list, numpy 1-D array, pandas Series or None
            The array of data to be set.

        Returns
        -------
        self : Dataset
            Dataset with set property.
        """
        if self.handle is None:
            raise Exception("Cannot set %s before construct dataset" % field_name)

        if data is None:
            # set to None
            status = _LIB.LGBM_DatasetSetField(
                self.handle,
                c_str(field_name),
                None,
                ctypes.c_int(0),
                ctypes.c_int(FIELD_TYPE_MAPPER[field_name]))
            _safe_call(status)
            return self

        # 'init_score' is converted to float64, 'group' to int32, any other field to float32.
        if field_name == 'init_score':
            target_dtype = np.float64
        elif field_name == 'group':
            target_dtype = np.int32
        else:
            target_dtype = np.float32
        values = list_to_1d_numpy(data, target_dtype, name=field_name)

        if values.dtype == np.int32:
            ptr_data, type_data, _ = c_int_array(values)
        elif values.dtype == np.float32 or values.dtype == np.float64:
            ptr_data, type_data, _ = c_float_array(values)
        else:
            raise TypeError("Expected np.float32/64 or np.int32, met type({})".format(values.dtype))

        # The C type of the converted array must match the type registered for the field.
        if type_data != FIELD_TYPE_MAPPER[field_name]:
            raise TypeError("Input type error for set_field")

        status = _LIB.LGBM_DatasetSetField(
            self.handle,
            c_str(field_name),
            ptr_data,
            ctypes.c_int(len(values)),
            ctypes.c_int(type_data))
        _safe_call(status)
        self.version += 1
        return self
wxchan's avatar
wxchan committed
1303

wxchan's avatar
wxchan committed
1304
1305
    def get_field(self, field_name):
        """Read a property from the constructed Dataset.

        Parameters
        ----------
        field_name : string
            The field name of the information.

        Returns
        -------
        info : numpy array
            A numpy array with information from the Dataset.
            None is returned when the field is empty.
        """
        if self.handle is None:
            raise Exception("Cannot get %s before construct Dataset" % field_name)

        # Output arguments filled in by the C API call.
        out_len = ctypes.c_int()
        out_type = ctypes.c_int()
        out_ptr = ctypes.POINTER(ctypes.c_void_p)()
        status = _LIB.LGBM_DatasetGetField(
            self.handle,
            c_str(field_name),
            ctypes.byref(out_len),
            ctypes.byref(out_ptr),
            ctypes.byref(out_type))
        _safe_call(status)

        type_code = out_type.value
        if type_code != FIELD_TYPE_MAPPER[field_name]:
            raise TypeError("Return type error for get_field")
        length = out_len.value
        if length == 0:
            return None

        # Pick the C element type and the matching converter for the reported type.
        if type_code == C_API_DTYPE_INT32:
            c_type, to_numpy = ctypes.c_int32, cint32_array_to_numpy
        elif type_code == C_API_DTYPE_FLOAT32:
            c_type, to_numpy = ctypes.c_float, cfloat32_array_to_numpy
        elif type_code == C_API_DTYPE_FLOAT64:
            c_type, to_numpy = ctypes.c_double, cfloat64_array_to_numpy
        else:
            raise TypeError("Unknown type")
        return to_numpy(ctypes.cast(out_ptr, ctypes.POINTER(c_type)), length)
Guolin Ke's avatar
Guolin Ke committed
1340

1341
    def set_categorical_feature(self, categorical_feature):
        """Change the categorical features of the Dataset.

        Parameters
        ----------
        categorical_feature : list of int or strings
            Names or indices of categorical features.

        Returns
        -------
        self : Dataset
            Dataset with set categorical features.
        """
        if self.categorical_feature == categorical_feature:
            # Nothing changes.
            return self
        if self.data is None:
            raise LightGBMError("Cannot set categorical feature after freed raw data, "
                                "set free_raw_data=False when construct Dataset to avoid this.")
        if self.categorical_feature is not None:
            if categorical_feature == 'auto':
                # Keep the value already stored in the Dataset.
                warnings.warn('Using categorical_feature in Dataset.')
                return self
            warnings.warn('categorical_feature in Dataset is overridden.\n'
                          'New categorical_feature is {}'.format(sorted(categorical_feature)))
        # Store the new value and free the handle.
        self.categorical_feature = categorical_feature
        return self._free_handle()
1371

Guolin Ke's avatar
Guolin Ke committed
1372
    def _set_predictor(self, predictor):
1373
1374
1375
1376
        """Set predictor for continued training.

        It is not recommended for user to call this function.
        Please use init_model argument in engine.train() or engine.cv() instead.
Guolin Ke's avatar
Guolin Ke committed
1377
1378
        """
        if predictor is self._predictor:
Nikita Titov's avatar
Nikita Titov committed
1379
            return self
1380
1381
1382
        if self.data is not None or (self.used_indices is not None
                                     and self.reference is not None
                                     and self.reference.data is not None):
Guolin Ke's avatar
Guolin Ke committed
1383
            self._predictor = predictor
Nikita Titov's avatar
Nikita Titov committed
1384
            return self._free_handle()
Guolin Ke's avatar
Guolin Ke committed
1385
        else:
1386
1387
            raise LightGBMError("Cannot set predictor after freed raw data, "
                                "set free_raw_data=False when construct Dataset to avoid this.")
Guolin Ke's avatar
Guolin Ke committed
1388
1389

    def set_reference(self, reference):
        """Assign the reference Dataset.

        Parameters
        ----------
        reference : Dataset
            Reference that is used as a template to construct the current Dataset.

        Returns
        -------
        self : Dataset
            Dataset with set reference.
        """
        # Take over categorical features, feature names and predictor from the reference;
        # each setter is invoked on the object returned by the previous one.
        updated = self.set_categorical_feature(reference.categorical_feature)
        updated = updated.set_feature_name(reference.feature_name)
        updated._set_predictor(reference._predictor)

        # we're done if self and reference share a common upstream reference
        shared = self.get_ref_chain().intersection(reference.get_ref_chain())
        if shared:
            return self
        if self.data is None:
            raise LightGBMError("Cannot set reference after freed raw data, "
                                "set free_raw_data=False when construct Dataset to avoid this.")
        self.reference = reference
        return self._free_handle()
Guolin Ke's avatar
Guolin Ke committed
1414
1415

    def set_feature_name(self, feature_name):
        """Assign names to the features of the Dataset.

        Parameters
        ----------
        feature_name : list of strings
            Feature names.

        Returns
        -------
        self : Dataset
            Dataset with set feature name.
        """
        is_auto = feature_name == 'auto'
        if not is_auto:
            self.feature_name = feature_name
        # Names are only pushed to the C side for a constructed Dataset and an explicit list.
        if self.handle is None or feature_name is None or is_auto:
            return self
        n_names = len(feature_name)
        if n_names != self.num_feature():
            raise ValueError("Length of feature_name({}) and num_feature({}) don't match"
                             .format(n_names, self.num_feature()))
        encoded_names = [c_str(name) for name in feature_name]
        status = _LIB.LGBM_DatasetSetFeatureNames(
            self.handle,
            c_array(ctypes.c_char_p, encoded_names),
            ctypes.c_int(n_names))
        _safe_call(status)
        return self
Guolin Ke's avatar
Guolin Ke committed
1440
1441

    def set_label(self, label):
        """Assign the label of the Dataset.

        Parameters
        ----------
        label : list, numpy 1-D array, pandas Series / one-column DataFrame or None
            The label information to be set into Dataset.

        Returns
        -------
        self : Dataset
            Dataset with set label.
        """
        self.label = label
        if self.handle is None:
            # Not constructed yet: only remember the value.
            return self
        converted = list_to_1d_numpy(_label_from_pandas(label), name='label')
        self.set_field('label', converted)
        # original values can be modified at cpp side
        self.label = self.get_field('label')
        return self
Guolin Ke's avatar
Guolin Ke committed
1460
1461

    def set_weight(self, weight):
        """Assign a weight to each instance.

        Parameters
        ----------
        weight : list, numpy 1-D array, pandas Series or None
            Weight to be set for each data point.

        Returns
        -------
        self : Dataset
            Dataset with set weight.
        """
        # Weights that all compare equal to 1 are stored as None.
        all_ones = weight is not None and np.all(weight == 1)
        if all_ones:
            weight = None
        self.weight = weight
        if self.handle is None or weight is None:
            return self
        converted = list_to_1d_numpy(weight, name='weight')
        self.set_field('weight', converted)
        # original values can be modified at cpp side
        self.weight = self.get_field('weight')
        return self
Guolin Ke's avatar
Guolin Ke committed
1482
1483

    def set_init_score(self, init_score):
        """Assign the init score that Booster starts from.

        Parameters
        ----------
        init_score : list, numpy 1-D array, pandas Series or None
            Init score for Booster.

        Returns
        -------
        self : Dataset
            Dataset with set init score.
        """
        self.init_score = init_score
        if self.handle is None or init_score is None:
            # Nothing to push to the C side.
            return self
        converted = list_to_1d_numpy(init_score, np.float64, name='init_score')
        self.set_field('init_score', converted)
        # original values can be modified at cpp side
        self.init_score = self.get_field('init_score')
        return self
Guolin Ke's avatar
Guolin Ke committed
1502
1503

    def set_group(self, group):
        """Assign group sizes of the Dataset (used for ranking).

        Parameters
        ----------
        group : list, numpy 1-D array, pandas Series or None
            Group size of each group.

        Returns
        -------
        self : Dataset
            Dataset with set group.
        """
        self.group = group
        if self.handle is None or group is None:
            # Nothing to push to the C side.
            return self
        converted = list_to_1d_numpy(group, np.int32, name='group')
        self.set_field('group', converted)
        return self
Guolin Ke's avatar
Guolin Ke committed
1521
1522

    def get_label(self):
        """Return the label of the Dataset.

        Returns
        -------
        label : numpy array or None
            The label information from the Dataset.
        """
        current = self.label
        if current is None:
            # Not stored on the Python side: read it from the Dataset and keep it.
            current = self.get_field('label')
            self.label = current
        return current

    def get_weight(self):
        """Return the weight of the Dataset.

        Returns
        -------
        weight : numpy array or None
            Weight for each data point from the Dataset.
        """
        current = self.weight
        if current is None:
            # Not stored on the Python side: read it from the Dataset and keep it.
            current = self.get_field('weight')
            self.weight = current
        return current

    def get_init_score(self):
        """Return the initial score of the Dataset.

        Returns
        -------
        init_score : numpy array or None
            Init score of Booster.
        """
        current = self.init_score
        if current is None:
            # Not stored on the Python side: read it from the Dataset and keep it.
            current = self.get_field('init_score')
            self.init_score = current
        return current

1558
1559
1560
1561
1562
    def get_data(self):
        """Return the raw data of the Dataset.

        Returns
        -------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of numpy arrays or None
            Raw data used in the Dataset construction.
        """
        if self.handle is None:
            raise Exception("Cannot get data before construct Dataset")

        pending_slice = self.need_slice and self.used_indices is not None and self.reference is not None
        if pending_slice:
            # Take the raw data of the reference and keep only the used rows.
            source = self.reference.data
            self.data = source
            if source is not None:
                if isinstance(source, np.ndarray) or scipy.sparse.issparse(source):
                    self.data = source[self.used_indices, :]
                elif isinstance(source, DataFrame):
                    self.data = source.iloc[self.used_indices].copy()
                elif isinstance(source, DataTable):
                    self.data = source[self.used_indices, :]
                else:
                    warnings.warn("Cannot subset {} type of raw data.\n"
                                  "Returning original raw data".format(type(source).__name__))
            # Slicing is attempted only once.
            self.need_slice = False

        if self.data is None:
            raise LightGBMError("Cannot call `get_data` after freed raw data, "
                                "set free_raw_data=False when construct Dataset to avoid this.")
        return self.data

Guolin Ke's avatar
Guolin Ke committed
1586
    def get_group(self):
        """Return the group of the Dataset.

        Returns
        -------
        group : numpy array or None
            Group size of each group.
        """
        if self.group is None:
            boundaries = self.get_field('group')
            self.group = boundaries
            if boundaries is not None:
                # group data from LightGBM is boundaries data, need to convert to group size
                self.group = np.diff(boundaries)
        return self.group

    def num_data(self):
        """Return the number of rows in the Dataset.

        Returns
        -------
        number_of_rows : int
            The number of rows in the Dataset.
        """
        if self.handle is None:
            raise LightGBMError("Cannot get num_data before construct dataset")
        # Output argument filled in by the C API call.
        n_rows = ctypes.c_int()
        status = _LIB.LGBM_DatasetGetNumData(self.handle, ctypes.byref(n_rows))
        _safe_call(status)
        return n_rows.value
Guolin Ke's avatar
Guolin Ke committed
1616
1617

    def num_feature(self):
        """Return the number of columns (features) in the Dataset.

        Returns
        -------
        number_of_columns : int
            The number of columns (features) in the Dataset.
        """
        if self.handle is None:
            raise LightGBMError("Cannot get num_feature before construct dataset")
        # Output argument filled in by the C API call.
        n_cols = ctypes.c_int()
        status = _LIB.LGBM_DatasetGetNumFeature(self.handle, ctypes.byref(n_cols))
        _safe_call(status)
        return n_cols.value
Guolin Ke's avatar
Guolin Ke committed
1632

1633
    def get_ref_chain(self, ref_limit=100):
        """Collect the chain of Dataset objects reachable through ``reference``.

        Starts with r, then goes to r.reference (if exists),
        then to r.reference.reference, etc.
        until we hit ``ref_limit`` or a reference loop.

        Parameters
        ----------
        ref_limit : int, optional (default=100)
            The limit number of references.

        Returns
        -------
        ref_chain : set of Dataset
            Chain of references of the Datasets.
        """
        chain = set()
        node = self
        # Stop at the size limit or as soon as the current node is not a Dataset.
        while len(chain) < ref_limit and isinstance(node, Dataset):
            chain.add(node)
            parent = node.reference
            if parent is None or parent in chain:
                # End of the chain or a reference loop.
                break
            node = parent
        return chain
1662

1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
    def add_features_from(self, other):
        """Append the features of another Dataset to the current Dataset.

        Both Datasets must be constructed before calling this method.

        Parameters
        ----------
        other : Dataset
            The Dataset to take features from.

        Returns
        -------
        self : Dataset
            Dataset with the new features added.
        """
        both_constructed = self.handle is not None and other.handle is not None
        if not both_constructed:
            raise ValueError('Both source and target Datasets must be constructed before adding features')
        status = _LIB.LGBM_DatasetAddFeaturesFrom(self.handle, other.handle)
        _safe_call(status)
        return self

1683
    def _dump_text(self, filename):
        """Write the Dataset to a text file.

        This format cannot be loaded back in by LightGBM, but is useful for debugging purposes.

        Parameters
        ----------
        filename : string
            Name of the output file.

        Returns
        -------
        self : Dataset
            Returns self.
        """
        # ``construct()`` is called first so that a handle is available to dump.
        constructed = self.construct()
        status = _LIB.LGBM_DatasetDumpText(constructed.handle, c_str(filename))
        _safe_call(status)
        return self

wxchan's avatar
wxchan committed
1703

wxchan's avatar
wxchan committed
1704
class Booster(object):
1705
    """Booster in LightGBM."""
1706

1707
    def __init__(self, params=None, train_set=None, model_file=None, model_str=None, silent=False):
        """Initialize the Booster.

        The source of the model is chosen in this order: ``train_set`` (training task),
        then ``model_file``, then ``model_str``. The first one that is not None is used.

        Parameters
        ----------
        params : dict or None, optional (default=None)
            Parameters for Booster. The passed dict is deep-copied, not modified.
        train_set : Dataset or None, optional (default=None)
            Training dataset.
        model_file : string or None, optional (default=None)
            Path to the model file.
        model_str : string or None, optional (default=None)
            Model will be loaded from this string.
        silent : bool, optional (default=False)
            Whether to print messages during construction.

        Raises
        ------
        TypeError
            If ``train_set`` is not a Dataset, or if none of ``train_set``,
            ``model_file`` and ``model_str`` is provided.
        ValueError
            If a "machines" parameter is neither a string nor a list/set.
        """
        self.handle = None
        self.network = False
        self.__need_reload_eval_info = True
        self._train_data_name = "training"
        self.__attr = {}
        self.__set_objective_to_none = False
        self.best_iteration = -1
        self.best_score = {}
        if params is None:
            params = {}
        else:
            params = copy.deepcopy(params)
        # an explicit verbosity setting in params takes priority over `silent`
        verbosity_given = any(verbose_alias in params
                              for verbose_alias in _ConfigAliases.get("verbosity"))
        if silent and not verbosity_given:
            params["verbose"] = -1

        if train_set is not None:
            # Training task
            if not isinstance(train_set, Dataset):
                raise TypeError('Training data should be Dataset instance, met {}'
                                .format(type(train_set).__name__))
            # set up the network from the first "machines" alias found in params
            for alias in _ConfigAliases.get("machines"):
                if alias not in params:
                    continue
                machines = params[alias]
                if isinstance(machines, string_type):
                    num_machines = len(machines.split(','))
                elif isinstance(machines, (list, set)):
                    num_machines = len(machines)
                    machines = ','.join(machines)
                else:
                    raise ValueError("Invalid machines in params.")
                listen_port = params.get("local_listen_port", 12400)
                time_out = params.get("listen_time_out", 120)
                # setdefault also records the derived machine count in params
                machine_count = params.setdefault("num_machines", num_machines)
                self.set_network(machines,
                                 local_listen_port=listen_port,
                                 listen_time_out=time_out,
                                 num_machines=machine_count)
                break
            # construct booster object
            train_set.construct()
            # parameters of the train_set override the ones passed in
            params.update(train_set.get_params())
            params_str = param_dict_to_str(params)
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_BoosterCreate(train_set.handle,
                                               c_str(params_str),
                                               ctypes.byref(self.handle)))
            # keep references to the data
            self.train_set = train_set
            self.valid_sets = []
            self.name_valid_sets = []
            self.__num_dataset = 1
            self.__init_predictor = train_set._predictor
            if self.__init_predictor is not None:
                _safe_call(_LIB.LGBM_BoosterMerge(self.handle,
                                                  self.__init_predictor.handle))
            num_class = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(self.handle,
                                                      ctypes.byref(num_class)))
            self.__num_class = num_class.value
            # buffers for inner predict: one slot for the training data
            self.__inner_predict_buffer = [None]
            self.__is_predicted_cur_iter = [False]
            self.__get_eval_info()
            self.pandas_categorical = train_set.pandas_categorical
            self.train_set_version = train_set.version
        elif model_file is not None:
            # Prediction task
            num_iterations = ctypes.c_int(0)
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(c_str(model_file),
                                                            ctypes.byref(num_iterations),
                                                            ctypes.byref(self.handle)))
            num_class = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(self.handle,
                                                      ctypes.byref(num_class)))
            self.__num_class = num_class.value
            self.pandas_categorical = _load_pandas_categorical(file_name=model_file)
        elif model_str is not None:
            self.model_from_string(model_str, not silent)
        else:
            raise TypeError('Need at least one training dataset or model file or model string '
                            'to create Booster instance')
        self.params = params
wxchan's avatar
wxchan committed
1807
1808

    def __del__(self):
        """Free the network (if it was set) and the native Booster handle.

        AttributeError is ignored in both steps, so an instance on which
        ``network`` or ``handle`` was never assigned can still be finalized.
        """
        try:
            network_active = self.network
            if network_active:
                self.free_network()
        except AttributeError:
            pass
        try:
            booster_handle = self.handle
            if booster_handle is not None:
                _safe_call(_LIB.LGBM_BoosterFree(booster_handle))
        except AttributeError:
            pass
wxchan's avatar
wxchan committed
1819

wxchan's avatar
wxchan committed
1820
1821
1822
1823
    def __copy__(self):
        return self.__deepcopy__(None)

    def __deepcopy__(self, _):
        """Return a new Booster built from this Booster's model string.

        The memo argument is ignored. The model is serialized with
        ``num_iteration=-1`` and a fresh Booster is created from that string.
        """
        return Booster(model_str=self.model_to_string(num_iteration=-1))
wxchan's avatar
wxchan committed
1827
1828
1829
1830
1831
1832
1833

    def __getstate__(self):
        this = self.__dict__.copy()
        handle = this['handle']
        this.pop('train_set', None)
        this.pop('valid_sets', None)
        if handle is not None:
1834
            this["handle"] = self.model_to_string(num_iteration=-1)
wxchan's avatar
wxchan committed
1835
1836
1837
        return this

    def __setstate__(self, state):
1838
1839
        model_str = state.get('handle', None)
        if model_str is not None:
wxchan's avatar
wxchan committed
1840
            handle = ctypes.c_void_p()
Guolin Ke's avatar
Guolin Ke committed
1841
            out_num_iterations = ctypes.c_int(0)
1842
1843
1844
1845
            _safe_call(_LIB.LGBM_BoosterLoadModelFromString(
                c_str(model_str),
                ctypes.byref(out_num_iterations),
                ctypes.byref(handle)))
wxchan's avatar
wxchan committed
1846
1847
1848
            state['handle'] = handle
        self.__dict__.update(state)

wxchan's avatar
wxchan committed
1849
    def free_dataset(self):
Nikita Titov's avatar
Nikita Titov committed
1850
1851
1852
1853
1854
1855
1856
        """Free Booster's Datasets.

        Returns
        -------
        self : Booster
            Booster without Datasets.
        """
wxchan's avatar
wxchan committed
1857
1858
        self.__dict__.pop('train_set', None)
        self.__dict__.pop('valid_sets', None)
1859
        self.__num_dataset = 0
Nikita Titov's avatar
Nikita Titov committed
1860
        return self
wxchan's avatar
wxchan committed
1861

1862
1863
1864
    def _free_buffer(self):
        self.__inner_predict_buffer = []
        self.__is_predicted_cur_iter = []
Nikita Titov's avatar
Nikita Titov committed
1865
        return self
1866

1867
1868
1869
1870
1871
1872
    def set_network(self, machines, local_listen_port=12400,
                    listen_time_out=120, num_machines=1):
        """Set the network configuration.

        Parameters
        ----------
        machines : list, set or string
            Names of machines.
            A list or set is joined into one comma-separated string.
        local_listen_port : int, optional (default=12400)
            TCP listen port for local machines.
        listen_time_out : int, optional (default=120)
            Socket time-out in minutes.
        num_machines : int, optional (default=1)
            The number of machines for parallel learning application.

        Returns
        -------
        self : Booster
            Booster with set network.
        """
        # The documented list/set input has to become the single comma-separated
        # string that is handed to the C API (the constructor joins the same way).
        if isinstance(machines, (list, set)):
            machines = ','.join(machines)
        _safe_call(_LIB.LGBM_NetworkInit(c_str(machines),
                                         ctypes.c_int(local_listen_port),
                                         ctypes.c_int(listen_time_out),
                                         ctypes.c_int(num_machines)))
        self.network = True
        return self
1893
1894

    def free_network(self):
        """Free Booster's network.

        Calls the native network teardown and marks the network as not set.

        Returns
        -------
        self : Booster
            Booster with freed network.
        """
        status = _LIB.LGBM_NetworkFree()
        _safe_call(status)
        self.network = False
        return self
1905

1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
    def trees_to_dataframe(self):
        """Parse the fitted model and return in an easy-to-read pandas DataFrame.

        The model is obtained from ``self.dump_model()``; every split and leaf of
        every tree becomes one row.

        Returns
        -------
        result : pandas DataFrame
            Returns a pandas DataFrame of the parsed model.

        Raises
        ------
        LightGBMError
            If pandas is not installed or the Booster contains no trees.
        """
        if not PANDAS_INSTALLED:
            raise LightGBMError('This method cannot be run without pandas installed')

        if self.num_trees() == 0:
            raise LightGBMError('There are no trees in this Booster and thus nothing to parse')

        def _is_split_node(tree):
            # split nodes carry a 'split_index', leaves do not
            return 'split_index' in tree

        def create_node_record(tree, node_depth=1, tree_index=None,
                               feature_names=None, parent_node=None):
            """Build the row (ordered mapping) describing one node."""

            def _get_node_index(tree, tree_index):
                # format: "<tree_index>-<S|L><node number>"; the prefix is omitted without tree_index
                prefix = '' if tree_index is None else str(tree_index) + '-'
                if _is_split_node(tree):
                    kind, index_key = 'S', 'split_index'
                else:
                    kind, index_key = 'L', 'leaf_index'
                # if a single node tree it won't have `leaf_index` so return 0
                return prefix + kind + str(tree.get(index_key, 0))

            def _get_split_feature(tree, feature_names):
                if not _is_split_node(tree):
                    return None
                feature = tree['split_feature']
                if feature_names is not None:
                    return feature_names[feature]
                return feature

            def _is_single_node_tree(tree):
                return set(tree) == {'leaf_value'}

            # Create the node record, and populate universal data members;
            # the key order here defines the column order of the result
            node = OrderedDict([
                ('tree_index', tree_index),
                ('node_depth', node_depth),
                ('node_index', _get_node_index(tree, tree_index)),
                ('left_child', None),
                ('right_child', None),
                ('parent_index', parent_node),
                ('split_feature', _get_split_feature(tree, feature_names)),
                ('split_gain', None),
                ('threshold', None),
                ('decision_type', None),
                ('missing_direction', None),
                ('missing_type', None),
                ('value', None),
                ('weight', None),
                ('count', None),
            ])

            # Update values to reflect node type (leaf or split)
            if _is_split_node(tree):
                node['left_child'] = _get_node_index(tree['left_child'], tree_index)
                node['right_child'] = _get_node_index(tree['right_child'], tree_index)
                for column in ('split_gain', 'threshold', 'decision_type'):
                    node[column] = tree[column]
                node['missing_direction'] = 'left' if tree['default_left'] else 'right'
                node['missing_type'] = tree['missing_type']
                node['value'] = tree['internal_value']
                node['weight'] = tree['internal_weight']
                node['count'] = tree['internal_count']
            else:
                node['value'] = tree['leaf_value']
                if not _is_single_node_tree(tree):
                    node['weight'] = tree['leaf_weight']
                    node['count'] = tree['leaf_count']

            return node

        def tree_dict_to_node_list(tree, node_depth=1, tree_index=None,
                                   feature_names=None, parent_node=None):
            """Flatten a (sub)tree into a list of node records, parent first."""
            node = create_node_record(tree,
                                      node_depth=node_depth,
                                      tree_index=tree_index,
                                      feature_names=feature_names,
                                      parent_node=parent_node)
            records = [node]
            if not _is_split_node(tree):
                return records
            # traverse the next level of the tree, left child before right child
            for child_key in ('left_child', 'right_child'):
                records.extend(tree_dict_to_node_list(
                    tree[child_key],
                    node_depth=node_depth + 1,
                    tree_index=tree_index,
                    feature_names=feature_names,
                    parent_node=node['node_index']))
            return records

        model_dict = self.dump_model()
        feature_names = model_dict['feature_names']
        model_list = []
        for tree_info in model_dict['tree_info']:
            nodes = tree_dict_to_node_list(tree_info['tree_structure'],
                                           tree_index=tree_info['tree_index'],
                                           feature_names=feature_names)
            model_list.extend(nodes)

        return DataFrame(model_list, columns=model_list[0].keys())

wxchan's avatar
wxchan committed
2021
    def set_train_data_name(self, name):
2022
2023
2024
2025
        """Set the name to the training Dataset.

        Parameters
        ----------
Nikita Titov's avatar
Nikita Titov committed
2026
2027
2028
2029
2030
2031
2032
        name : string
            Name for the training Dataset.

        Returns
        -------
        self : Booster
            Booster with set training Dataset name.
2033
        """
2034
        self._train_data_name = name
Nikita Titov's avatar
Nikita Titov committed
2035
        return self
wxchan's avatar
wxchan committed
2036
2037

    def add_valid(self, data, name):
        """Add validation data.

        Parameters
        ----------
        data : Dataset
            Validation data. It is constructed if necessary.
        name : string
            Name of validation data.

        Returns
        -------
        self : Booster
            Booster with set validation data.

        Raises
        ------
        TypeError
            If ``data`` is not a Dataset.
        LightGBMError
            If ``data`` does not use the same predictor object as the Booster.
        """
        if not isinstance(data, Dataset):
            raise TypeError('Validation data should be Dataset instance, met {}'
                            .format(type(data).__name__))
        same_predictor = data._predictor is self.__init_predictor
        if not same_predictor:
            raise LightGBMError("Add validation data failed, "
                                "you should use same predictor for these data")
        valid_handle = data.construct().handle
        _safe_call(_LIB.LGBM_BoosterAddValidData(self.handle, valid_handle))
        # bookkeeping: every per-dataset list gets one more slot
        self.valid_sets.append(data)
        self.name_valid_sets.append(name)
        self.__num_dataset += 1
        self.__inner_predict_buffer.append(None)
        self.__is_predicted_cur_iter.append(False)
        return self
wxchan's avatar
wxchan committed
2067
2068

    def reset_parameter(self, params):
        """Reset parameters of Booster.

        Parameters
        ----------
        params : dict
            New parameters for Booster.
            They are passed to the native Booster only when their string form
            is non-empty, and are always merged into ``self.params``.

        Returns
        -------
        self : Booster
            Booster with new parameters.
        """
        serialized = param_dict_to_str(params)
        if serialized:
            status = _LIB.LGBM_BoosterResetParameter(self.handle, c_str(serialized))
            _safe_call(status)
        self.params.update(params)
        return self
wxchan's avatar
wxchan committed
2088
2089

    def update(self, train_set=None, fobj=None):
        """Update Booster for one iteration.

        Parameters
        ----------
        train_set : Dataset or None, optional (default=None)
            Training data.
            If None, last training data is used.
        fobj : callable or None, optional (default=None)
            Customized objective function.
            Should accept two parameters: preds, train_data,
            and return (grad, hess).

                preds : list or numpy 1-D array
                    The predicted values.
                train_data : Dataset
                    The training dataset.
                grad : list or numpy 1-D array
                    The value of the first order derivative (gradient) for each sample point.
                hess : list or numpy 1-D array
                    The value of the second order derivative (Hessian) for each sample point.

            For multi-class task, the preds is group by class_id first, then group by row_id.
            If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i]
            and you should group grad and hess in this way as well.

        Returns
        -------
        is_finished : bool
            Whether the update was successfully finished.

        Raises
        ------
        TypeError
            If the new training data is not a Dataset.
        LightGBMError
            If the new training data uses a different predictor, or if no ``fobj``
            is given after the objective was set to none by a previous custom update.
        """
        # the stored training data changed since it was last handed to the Booster
        if train_set is None and self.train_set_version != self.train_set.version:
            train_set = self.train_set
            unchanged = False
        else:
            unchanged = train_set is self.train_set and self.train_set_version == train_set.version
        # need reset training data
        if train_set is not None and not unchanged:
            if not isinstance(train_set, Dataset):
                raise TypeError('Training data should be Dataset instance, met {}'
                                .format(type(train_set).__name__))
            if train_set._predictor is not self.__init_predictor:
                raise LightGBMError("Replace training data failed, "
                                    "you should use same predictor for these data")
            self.train_set = train_set
            new_handle = self.train_set.construct().handle
            _safe_call(_LIB.LGBM_BoosterResetTrainingData(self.handle, new_handle))
            self.__inner_predict_buffer[0] = None
            self.train_set_version = self.train_set.version

        is_finished = ctypes.c_int(0)
        if fobj is not None:
            # custom objective: the built-in objective is switched off once
            if not self.__set_objective_to_none:
                booster = self.reset_parameter({"objective": "none"})
                booster.__set_objective_to_none = True
            grad, hess = fobj(self.__inner_predict(0), self.train_set)
            return self.__boost(grad, hess)

        if self.__set_objective_to_none:
            raise LightGBMError('Cannot update due to null objective function.')
        _safe_call(_LIB.LGBM_BoosterUpdateOneIter(self.handle,
                                                  ctypes.byref(is_finished)))
        # predictions cached for the previous iteration are stale now
        self.__is_predicted_cur_iter = [False] * self.__num_dataset
        return is_finished.value == 1

    def __boost(self, grad, hess):
        """Boost Booster for one iteration with customized gradient statistics.

        .. note::

            For multi-class task, the score is group by class_id first, then group by row_id.
            If you want to get i-th row score in j-th class, the access way is score[j * num_data + i]
            and you should group grad and hess in this way as well.

        Parameters
        ----------
        grad : list or numpy 1-D array
            The first order derivative (gradient).
        hess : list or numpy 1-D array
            The second order derivative (Hessian).

        Returns
        -------
        is_finished : bool
            Whether the boost was successfully finished.

        Raises
        ------
        ValueError
            If gradient and hessian have different lengths.
        """
        grad_arr = list_to_1d_numpy(grad, name='gradient')
        hess_arr = list_to_1d_numpy(hess, name='hessian')
        # the raw data pointers are handed to the C API below
        assert grad_arr.flags.c_contiguous
        assert hess_arr.flags.c_contiguous
        num_grad, num_hess = len(grad_arr), len(hess_arr)
        if num_grad != num_hess:
            raise ValueError("Lengths of gradient({}) and hessian({}) don't match"
                             .format(num_grad, num_hess))
        float_ptr = ctypes.POINTER(ctypes.c_float)
        is_finished = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
            self.handle,
            grad_arr.ctypes.data_as(float_ptr),
            hess_arr.ctypes.data_as(float_ptr),
            ctypes.byref(is_finished)))
        # predictions cached for the previous iteration are stale now
        self.__is_predicted_cur_iter = [False] * self.__num_dataset
        return is_finished.value == 1

    def rollback_one_iter(self):
        """Rollback one iteration.

        All per-dataset "already predicted" flags are reset afterwards.

        Returns
        -------
        self : Booster
            Booster with rolled back one iteration.
        """
        status = _LIB.LGBM_BoosterRollbackOneIter(self.handle)
        _safe_call(status)
        self.__is_predicted_cur_iter = [False] * self.__num_dataset
        return self
wxchan's avatar
wxchan committed
2203
2204

    def current_iteration(self):
        """Get the index of the current iteration.

        Returns
        -------
        cur_iter : int
            The index of the current iteration, as reported by the native Booster.
        """
        iteration = ctypes.c_int(0)
        status = _LIB.LGBM_BoosterGetCurrentIteration(self.handle, ctypes.byref(iteration))
        _safe_call(status)
        return iteration.value

2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
    def num_model_per_iteration(self):
        """Get number of models per iteration.

        Returns
        -------
        model_per_iter : int
            The number of models per iteration, as reported by the native Booster.
        """
        count = ctypes.c_int(0)
        status = _LIB.LGBM_BoosterNumModelPerIteration(self.handle, ctypes.byref(count))
        _safe_call(status)
        return count.value

    def num_trees(self):
        """Get number of weak sub-models.

        Returns
        -------
        num_trees : int
            The number of weak sub-models, as reported by the native Booster.
        """
        total = ctypes.c_int(0)
        status = _LIB.LGBM_BoosterNumberOfTotalModel(self.handle, ctypes.byref(total))
        _safe_call(status)
        return total.value

2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
    def upper_bound(self):
        """Get upper bound value of a model.

        Returns
        -------
        upper_bound : double
            Upper bound value of the model, as reported by the native Booster.
        """
        bound = ctypes.c_double(0)
        status = _LIB.LGBM_BoosterGetUpperBoundValue(self.handle, ctypes.byref(bound))
        _safe_call(status)
        return bound.value

    def lower_bound(self):
        """Get lower bound value of a model.

        Returns
        -------
        lower_bound : double
            Lower bound value of the model, as reported by the native Booster.
        """
        bound = ctypes.c_double(0)
        status = _LIB.LGBM_BoosterGetLowerBoundValue(self.handle, ctypes.byref(bound))
        _safe_call(status)
        return bound.value

wxchan's avatar
wxchan committed
2274
    def eval(self, data, name, feval=None):
        """Evaluate for data.

        If ``data`` is neither the training Dataset nor an already registered
        validation Dataset (compared by identity), it is first added as new
        validation data under ``name``.

        Parameters
        ----------
        data : Dataset
            Data for the evaluating.
        name : string
            Name of the data.
        feval : callable or None, optional (default=None)
            Customized evaluation function.
            Should accept two parameters: preds, eval_data,
            and return (eval_name, eval_result, is_higher_better) or list of such tuples.

                preds : list or numpy 1-D array
                    The predicted values.
                eval_data : Dataset
                    The evaluation dataset.
                eval_name : string
                    The name of evaluation function (without whitespaces).
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Is eval result higher better, e.g. AUC is ``is_higher_better``.

            For multi-class task, the preds is group by class_id first, then group by row_id.
            If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].

        Returns
        -------
        result : list
            List with evaluation results.

        Raises
        ------
        TypeError
            If ``data`` is not a Dataset.
        """
        if not isinstance(data, Dataset):
            raise TypeError("Can only eval for Dataset instance")
        if data is self.train_set:
            data_idx = 0
        else:
            # validation sets occupy indices 1..n; -1 marks "not registered yet"
            data_idx = next((position + 1
                             for position, valid_set in enumerate(self.valid_sets)
                             if valid_set is data), -1)
        # need to push new valid data
        if data_idx == -1:
            self.add_valid(data, name)
            data_idx = self.__num_dataset - 1

        return self.__inner_eval(name, data_idx, feval)

    def eval_train(self, feval=None):
        """Compute evaluation results on the training data.

        Parameters
        ----------
        feval : callable or None, optional (default=None)
            Custom evaluation function.
            It is called with two arguments, ``preds`` and ``train_data``,
            and must return a ``(eval_name, eval_result, is_higher_better)`` tuple
            or a list of such tuples.

                preds : list or numpy 1-D array
                    The predicted values.
                train_data : Dataset
                    The training dataset.
                eval_name : string
                    The name of the evaluation function (without whitespaces).
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.

            For multi-class task, the preds are grouped by class_id first, then by row_id.
            To get the prediction of the i-th row for the j-th class, use preds[j * num_data + i].

        Returns
        -------
        result : list
            List with evaluation results.
        """
        # Dataset index 0 is the training set.
        train_idx = 0
        train_name = self._train_data_name
        return self.__inner_eval(train_name, train_idx, feval)
wxchan's avatar
wxchan committed
2354
2355

    def eval_valid(self, feval=None):
        """Compute evaluation results on every validation dataset.

        Parameters
        ----------
        feval : callable or None, optional (default=None)
            Custom evaluation function.
            It is called with two arguments, ``preds`` and ``valid_data``,
            and must return a ``(eval_name, eval_result, is_higher_better)`` tuple
            or a list of such tuples.

                preds : list or numpy 1-D array
                    The predicted values.
                valid_data : Dataset
                    The validation dataset.
                eval_name : string
                    The name of the evaluation function (without whitespaces).
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.

            For multi-class task, the preds are grouped by class_id first, then by row_id.
            To get the prediction of the i-th row for the j-th class, use preds[j * num_data + i].

        Returns
        -------
        result : list
            List with evaluation results.
        """
        results = []
        # Index 0 is the training set, so validation sets start at 1
        # while their names are stored with an offset of one.
        for data_idx in range_(1, self.__num_dataset):
            valid_name = self.name_valid_sets[data_idx - 1]
            results.extend(self.__inner_eval(valid_name, data_idx, feval))
        return results
wxchan's avatar
wxchan committed
2386

2387
    def save_model(self, filename, num_iteration=None, start_iteration=0):
        """Write the Booster to a file.

        Parameters
        ----------
        filename : string
            Name of the file the Booster is saved to.
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.

        Returns
        -------
        self : Booster
            Returns self.
        """
        last_iteration = self.best_iteration if num_iteration is None else num_iteration
        status = _LIB.LGBM_BoosterSaveModel(
            self.handle,
            ctypes.c_int(start_iteration),
            ctypes.c_int(last_iteration),
            c_str(filename))
        _safe_call(status)
        # The pandas categorical mapping is handed to its helper with the same file name.
        _dump_pandas_categorical(self.pandas_categorical, filename)
        return self
wxchan's avatar
wxchan committed
2415

2416
    def shuffle_models(self, start_iteration=0, end_iteration=-1):
        """Shuffle the models of the Booster.

        Parameters
        ----------
        start_iteration : int, optional (default=0)
            The first iteration that will be shuffled.
        end_iteration : int, optional (default=-1)
            The last iteration that will be shuffled.
            If <= 0, means the last available iteration.

        Returns
        -------
        self : Booster
            Booster with shuffled models.
        """
        first_iter = ctypes.c_int(start_iteration)
        last_iter = ctypes.c_int(end_iteration)
        status = _LIB.LGBM_BoosterShuffleModels(self.handle, first_iter, last_iter)
        _safe_call(status)
        return self
2437
2438
2439
2440
2441
2442

    def model_from_string(self, model_str, verbose=True):
        """Replace the current model with one loaded from a string.

        Parameters
        ----------
        model_str : string
            Model will be loaded from this string.
        verbose : bool, optional (default=True)
            Whether to print messages while loading model.

        Returns
        -------
        self : Booster
            Loaded Booster object.
        """
        # Release the model currently held (if any) and its buffers before loading.
        if self.handle is not None:
            _safe_call(_LIB.LGBM_BoosterFree(self.handle))
        self._free_buffer()
        self.handle = ctypes.c_void_p()

        num_iterations = ctypes.c_int(0)
        load_status = _LIB.LGBM_BoosterLoadModelFromString(
            c_str(model_str),
            ctypes.byref(num_iterations),
            ctypes.byref(self.handle))
        _safe_call(load_status)

        num_class = ctypes.c_int(0)
        class_status = _LIB.LGBM_BoosterGetNumClasses(
            self.handle,
            ctypes.byref(num_class))
        _safe_call(class_status)

        if verbose:
            print('Finished loading model, total used %d iterations' % int(num_iterations.value))
        self.__num_class = num_class.value
        self.pandas_categorical = _load_pandas_categorical(model_str=model_str)
        return self

    def model_to_string(self, num_iteration=None, start_iteration=0):
        """Save Booster to string.

        Parameters
        ----------
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.

        Returns
        -------
        str_repr : string
            String representation of Booster.
        """
        if num_iteration is None:
            num_iteration = self.best_iteration

        def _serialize(capacity):
            """Have the C API write the model into a new buffer of ``capacity`` bytes."""
            out_len = ctypes.c_int64(0)
            string_buffer = ctypes.create_string_buffer(capacity)
            ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer))
            _safe_call(_LIB.LGBM_BoosterSaveModelToString(
                self.handle,
                ctypes.c_int(start_iteration),
                ctypes.c_int(num_iteration),
                ctypes.c_int64(capacity),
                ctypes.byref(out_len),
                ptr_string_buffer))
            return string_buffer, out_len.value

        buffer_len = 1 << 20
        string_buffer, actual_len = _serialize(buffer_len)
        # if buffer length is not long enough, re-allocate a buffer of the reported length
        if actual_len > buffer_len:
            string_buffer, _ = _serialize(actual_len)
        # Decode explicitly as UTF-8: a bare ``decode()`` uses the interpreter default,
        # which is ASCII on Python 2 and fails on non-ASCII content.
        ret = string_buffer.value.decode('utf-8')
        ret += _dump_pandas_categorical(self.pandas_categorical)
        return ret
2517

2518
    def dump_model(self, num_iteration=None, start_iteration=0):
        """Dump Booster to JSON format.

        Parameters
        ----------
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be dumped.
            If None, if the best iteration exists, it is dumped; otherwise, all iterations are dumped.
            If <= 0, all iterations are dumped.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be dumped.

        Returns
        -------
        json_repr : dict
            JSON format of Booster.
        """
        if num_iteration is None:
            num_iteration = self.best_iteration

        def _dump(capacity):
            """Have the C API write the JSON dump into a new buffer of ``capacity`` bytes."""
            out_len = ctypes.c_int64(0)
            string_buffer = ctypes.create_string_buffer(capacity)
            ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer))
            _safe_call(_LIB.LGBM_BoosterDumpModel(
                self.handle,
                ctypes.c_int(start_iteration),
                ctypes.c_int(num_iteration),
                ctypes.c_int64(capacity),
                ctypes.byref(out_len),
                ptr_string_buffer))
            return string_buffer, out_len.value

        buffer_len = 1 << 20
        string_buffer, actual_len = _dump(buffer_len)
        # if buffer length is not long enough, reallocate a buffer of the reported length
        if actual_len > buffer_len:
            string_buffer, _ = _dump(actual_len)
        # Decode explicitly as UTF-8: a bare ``decode()`` uses the interpreter default,
        # which is ASCII on Python 2 and fails on non-ASCII content.
        ret = json.loads(string_buffer.value.decode('utf-8'))
        # Round-trip through JSON so the mapping is stored as plain JSON-compatible values.
        ret['pandas_categorical'] = json.loads(json.dumps(self.pandas_categorical,
                                                          default=json_default_with_numpy))
        return ret
wxchan's avatar
wxchan committed
2564

2565
2566
    def predict(self, data, num_iteration=None,
                raw_score=False, pred_leaf=False, pred_contrib=False,
                data_has_header=False, is_reshape=True, **kwargs):
        """Predict with the Booster.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
            Data source for prediction.
            If string, it represents the path to txt file.
        num_iteration : int or None, optional (default=None)
            Limit number of iterations in the prediction.
            If None, if the best iteration exists, it is used; otherwise, all iterations are used.
            If <= 0, all iterations are used (no limits).
        raw_score : bool, optional (default=False)
            Whether to predict raw scores.
        pred_leaf : bool, optional (default=False)
            Whether to predict leaf index.
        pred_contrib : bool, optional (default=False)
            Whether to predict feature contributions.

            .. note::

                If you want to get more explanations for your model's predictions using SHAP values,
                like SHAP interaction values,
                you can install the shap package (https://github.com/slundberg/shap).
                Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
                column, where the last column is the expected value.

        data_has_header : bool, optional (default=False)
            Whether the data has header.
            Used only if data is string.
        is_reshape : bool, optional (default=True)
            If True, result is reshaped to [nrow, ncol].
        **kwargs
            Other parameters for the prediction.

        Returns
        -------
        result : numpy array
            Prediction result.
        """
        # The predictor receives its own copy of the extra parameters.
        pred_parameter = copy.deepcopy(kwargs)
        predictor = self._to_predictor(pred_parameter)
        iteration_limit = self.best_iteration if num_iteration is None else num_iteration
        return predictor.predict(data, iteration_limit,
                                 raw_score, pred_leaf, pred_contrib,
                                 data_has_header, is_reshape)
wxchan's avatar
wxchan committed
2613

2614
    def refit(self, data, label, decay_rate=0.9, **kwargs):
        """Refit the existing Booster on new data.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
            Data source for refit.
            If string, it represents the path to txt file.
        label : list, numpy 1-D array or pandas Series / one-column DataFrame
            Label for refit.
        decay_rate : float, optional (default=0.9)
            Decay rate of refit,
            will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees.
        **kwargs
            Other parameters for refit.
            These parameters will be passed to ``predict`` method.

        Returns
        -------
        result : Booster
            Refitted Booster.
        """
        if self.__set_objective_to_none:
            raise LightGBMError('Cannot refit due to null objective function.')
        predictor = self._to_predictor(copy.deepcopy(kwargs))
        # Leaf indices of the new data, using all iterations of the current model.
        leaf_preds = predictor.predict(data, -1, pred_leaf=True)
        nrow, ncol = leaf_preds.shape
        train_set = Dataset(data, label, silent=True)
        refit_params = copy.deepcopy(self.params)
        refit_params['refit_decay_rate'] = decay_rate
        new_booster = Booster(refit_params, train_set)
        # Copy models
        _safe_call(_LIB.LGBM_BoosterMerge(
            new_booster.handle,
            predictor.handle))
        flat_leaf_preds = leaf_preds.reshape(-1)
        # Only the pointer is needed; the other two results stay bound until the call returns.
        ptr_data, _, _ = c_int_array(flat_leaf_preds)
        _safe_call(_LIB.LGBM_BoosterRefit(
            new_booster.handle,
            ptr_data,
            ctypes.c_int(nrow),
            ctypes.c_int(ncol)))
        # Carry the network state and a copy of the attributes over to the new Booster.
        new_booster.network = self.network
        new_booster.__attr = self.__attr.copy()
        return new_booster

2660
    def get_leaf_output(self, tree_id, leaf_id):
        """Return the output value of one leaf.

        Parameters
        ----------
        tree_id : int
            The index of the tree.
        leaf_id : int
            The index of the leaf in the tree.

        Returns
        -------
        result : float
            The output of the leaf.
        """
        leaf_value = ctypes.c_double(0)
        status = _LIB.LGBM_BoosterGetLeafValue(
            self.handle,
            ctypes.c_int(tree_id),
            ctypes.c_int(leaf_id),
            ctypes.byref(leaf_value))
        _safe_call(status)
        return leaf_value.value

2683
    def _to_predictor(self, pred_parameter=None):
        """Build a predictor around this Booster's handle."""
        inner = _InnerPredictor(booster_handle=self.handle, pred_parameter=pred_parameter)
        # The predictor uses the same pandas categorical mapping as the Booster.
        inner.pandas_categorical = self.pandas_categorical
        return inner

2689
    def num_feature(self):
        """Return the number of features.

        Returns
        -------
        num_feature : int
            The number of features.
        """
        feature_count = ctypes.c_int(0)
        status = _LIB.LGBM_BoosterGetNumFeature(
            self.handle,
            ctypes.byref(feature_count))
        _safe_call(status)
        return feature_count.value

wxchan's avatar
wxchan committed
2703
    def feature_name(self):
        """Get names of features.

        Returns
        -------
        result : list
            List with names of features.

        Raises
        ------
        ValueError
            If the number of returned names differs from ``num_feature()``.
        """
        num_feature = self.num_feature()
        # Get name of features
        tmp_out_len = ctypes.c_int(0)
        # One fixed-size 255-byte buffer per feature name.
        # NOTE(review): presumably names longer than that are not supported - confirm against the C API.
        string_buffers = [ctypes.create_string_buffer(255) for _ in range_(num_feature)]
        ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers))
        _safe_call(_LIB.LGBM_BoosterGetFeatureNames(
            self.handle,
            ctypes.byref(tmp_out_len),
            ptr_string_buffers))
        if num_feature != tmp_out_len.value:
            raise ValueError("Length of feature names doesn't equal with num_feature")
        # Decode explicitly as UTF-8: a bare ``decode()`` uses the interpreter default,
        # which is ASCII on Python 2 and fails on non-ASCII feature names.
        return [string_buffer.value.decode('utf-8') for string_buffer in string_buffers]

2724
    def feature_importance(self, importance_type='split', iteration=None):
        """Compute the importance of every feature.

        Parameters
        ----------
        importance_type : string, optional (default="split")
            How the importance is calculated.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.
        iteration : int or None, optional (default=None)
            Limit number of iterations in the feature importance calculation.
            If None, if the best iteration exists, it is used; otherwise, all trees are used.
            If <= 0, all trees are used (no limits).

        Returns
        -------
        result : numpy array
            Array with feature importances.
        """
        if iteration is None:
            iteration = self.best_iteration
        # 0 stands for "split", 1 for "gain"; any other value is passed on as -1.
        if importance_type == "split":
            type_code = 0
        else:
            type_code = 1 if importance_type == "gain" else -1
        importances = np.zeros(self.num_feature(), dtype=np.float64)
        status = _LIB.LGBM_BoosterFeatureImportance(
            self.handle,
            ctypes.c_int(iteration),
            ctypes.c_int(type_code),
            importances.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
        _safe_call(status)
        if type_code != 0:
            return importances
        # Split importances are returned as 32-bit integers.
        return importances.astype(np.int32)
2761

2762
2763
2764
2765
2766
2767
2768
2769
2770
    def get_split_value_histogram(self, feature, bins=None, xgboost_style=False):
        """Build the histogram of split values used for the given feature.

        Parameters
        ----------
        feature : int or string
            The feature name or index the histogram is calculated for.
            If int, interpreted as index.
            If string, interpreted as name.

            .. warning::

                Categorical features are not supported.

        bins : int, string or None, optional (default=None)
            The maximum number of bins.
            If None, or int and > number of unique split values and ``xgboost_style=True``,
            the number of bins equals number of unique split values.
            If string, it should be one from the list of the supported values by ``numpy.histogram()`` function.
        xgboost_style : bool, optional (default=False)
            Whether the returned result should be in the same form as it is in XGBoost.
            If False, the returned value is tuple of 2 numpy arrays as it is in ``numpy.histogram()`` function.
            If True, the returned value is matrix, in which the first column is the right edges of non-empty bins
            and the second one is the histogram values.

        Returns
        -------
        result_tuple : tuple of 2 numpy arrays
            If ``xgboost_style=False``, the values of the histogram of used splitting values for the specified feature
            and the bin edges.
        result_array_like : numpy array or pandas DataFrame (if pandas is installed)
            If ``xgboost_style=True``, the histogram of used splitting values for the specified feature.
        """
        model = self.dump_model()
        feature_names = model.get('feature_names')
        tree_infos = model['tree_info']
        # Split features are translated to names only when the caller asked by name.
        match_by_name = feature_names is not None and isinstance(feature, string_type)

        values = []
        for tree_info in tree_infos:
            # Pre-order walk of the tree with an explicit stack.
            pending = [tree_info['tree_structure']]
            while pending:
                node = pending.pop()
                if 'split_index' not in node:  # leaf
                    continue
                split_feature = node['split_feature']
                if match_by_name:
                    split_feature = feature_names[split_feature]
                if split_feature == feature:
                    if isinstance(node['threshold'], string_type):
                        raise LightGBMError('Cannot compute split value histogram for the categorical feature')
                    values.append(node['threshold'])
                # Push the right child first so the left subtree is visited first.
                pending.append(node['right_child'])
                pending.append(node['left_child'])

        cap_bins = bins is None or (isinstance(bins, integer_types) and xgboost_style)
        if cap_bins:
            n_unique = len(np.unique(values))
            wanted = n_unique if bins is None else min(n_unique, bins)
            # Always keep at least one bin.
            bins = max(wanted, 1)
        hist, bin_edges = np.histogram(values, bins=bins)
        if not xgboost_style:
            return hist, bin_edges
        # XGBoost style: (right edge, count) rows for non-empty bins only.
        ret = np.column_stack((bin_edges[1:], hist))
        ret = ret[ret[:, 1] > 0]
        if PANDAS_INSTALLED:
            return DataFrame(ret, columns=['SplitValue', 'Count'])
        return ret

wxchan's avatar
wxchan committed
2831
    def __inner_eval(self, data_name, data_idx, feval=None):
        """Evaluate the training or a validation dataset.

        Returns a list of ``(data_name, eval_name, value, is_higher_better)`` tuples:
        first the built-in metrics, then the results of ``feval`` if it is given.
        """
        if data_idx >= self.__num_dataset:
            raise ValueError("Data_idx should be smaller than number of dataset")
        self.__get_eval_info()
        ret = []
        n_metrics = self.__num_inner_eval
        if n_metrics > 0:
            result = np.zeros(n_metrics, dtype=np.float64)
            tmp_out_len = ctypes.c_int(0)
            status = _LIB.LGBM_BoosterGetEval(
                self.handle,
                ctypes.c_int(data_idx),
                ctypes.byref(tmp_out_len),
                result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
            _safe_call(status)
            if tmp_out_len.value != n_metrics:
                raise ValueError("Wrong length of eval results")
            ret.extend((data_name, self.__name_inner_eval[i],
                        result[i], self.__higher_better_inner_eval[i])
                       for i in range_(n_metrics))
        if feval is None:
            return ret
        # Index 0 is the training set; validation sets follow with an offset of one.
        cur_data = self.train_set if data_idx == 0 else self.valid_sets[data_idx - 1]
        feval_ret = feval(self.__inner_predict(data_idx), cur_data)
        # A single result tuple is handled like a one-element list.
        if not isinstance(feval_ret, list):
            feval_ret = [feval_ret]
        for eval_name, val, is_higher_better in feval_ret:
            ret.append((data_name, eval_name, val, is_higher_better))
        return ret

    def __inner_predict(self, data_idx):
        """Return the predictions for the training or a validation dataset.

        The result buffer is allocated on first use and reused afterwards.
        """
        if data_idx >= self.__num_dataset:
            raise ValueError("Data_idx should be smaller than number of dataset")
        if self.__inner_predict_buffer[data_idx] is None:
            # One value per row and per class.
            dataset = self.train_set if data_idx == 0 else self.valid_sets[data_idx - 1]
            n_preds = dataset.num_data() * self.__num_class
            self.__inner_predict_buffer[data_idx] = np.zeros(n_preds, dtype=np.float64)
        # Skip the C call if this dataset was already predicted in the current iteration.
        if self.__is_predicted_cur_iter[data_idx]:
            return self.__inner_predict_buffer[data_idx]
        tmp_out_len = ctypes.c_int64(0)
        data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        status = _LIB.LGBM_BoosterGetPredict(
            self.handle,
            ctypes.c_int(data_idx),
            ctypes.byref(tmp_out_len),
            data_ptr)
        _safe_call(status)
        if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
            raise ValueError("Wrong length of predict results for data %d" % (data_idx))
        self.__is_predicted_cur_iter[data_idx] = True
        return self.__inner_predict_buffer[data_idx]

    def __get_eval_info(self):
        """Get inner evaluation count and names.

        Does nothing unless a reload was requested. On a reload it stores the
        number of built-in metrics and, if there are any, their names and
        whether a higher value is better for each of them.

        Raises
        ------
        ValueError
            If the number of returned names differs from the reported metric count.
        """
        if not self.__need_reload_eval_info:
            return
        out_num_eval = ctypes.c_int(0)
        # Get num of inner evals
        _safe_call(_LIB.LGBM_BoosterGetEvalCounts(
            self.handle,
            ctypes.byref(out_num_eval)))
        self.__num_inner_eval = out_num_eval.value
        if self.__num_inner_eval > 0:
            # Get name of evals
            tmp_out_len = ctypes.c_int(0)
            string_buffers = [ctypes.create_string_buffer(255) for _ in range_(self.__num_inner_eval)]
            ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
            _safe_call(_LIB.LGBM_BoosterGetEvalNames(
                self.handle,
                ctypes.byref(tmp_out_len),
                ptr_string_buffers))
            if self.__num_inner_eval != tmp_out_len.value:
                raise ValueError("Length of eval names doesn't equal with num_evals")
            # Explicit codec: a bare ``decode()`` defaults to ASCII on Python 2.
            self.__name_inner_eval = \
                [string_buffer.value.decode('utf-8') for string_buffer in string_buffers]
            # Metrics whose name starts with one of these prefixes are "higher is better".
            self.__higher_better_inner_eval = \
                [name.startswith(('auc', 'ndcg@', 'map@')) for name in self.__name_inner_eval]
        # Clear the flag only after everything above succeeded, so that a failed
        # call is retried next time instead of leaving stale or missing info.
        self.__need_reload_eval_info = False
2913

wxchan's avatar
wxchan committed
2914
    def attr(self, key):
        """Look up an attribute string of the Booster.

        Parameters
        ----------
        key : string
            The name of the attribute.

        Returns
        -------
        value : string or None
            The attribute value.
            Returns None if attribute does not exist.
        """
        attributes = self.__attr
        return attributes.get(key)
wxchan's avatar
wxchan committed
2929
2930

    def set_attr(self, **kwargs):
        """Set or delete attributes of the Booster.

        Parameters
        ----------
        **kwargs
            The attributes to set.
            Setting a value to None deletes an attribute.

        Returns
        -------
        self : Booster
            Booster with set attributes.
        """
        for key, value in kwargs.items():
            if value is None:
                # None deletes the attribute; a missing key is not an error.
                self.__attr.pop(key, None)
                continue
            if not isinstance(value, string_type):
                raise ValueError("Only string values are accepted")
            self.__attr[key] = value
        return self