# coding: utf-8
"""Wrapper for C API of LightGBM."""
from __future__ import absolute_import

import copy
import ctypes
import os
import warnings
from tempfile import NamedTemporaryFile
from collections import OrderedDict

import numpy as np
import scipy.sparse

from .compat import (PANDAS_INSTALLED, DataFrame, Series, is_dtype_sparse,
                     DataTable,
                     decode_string, string_type,
                     integer_types, numeric_types,
                     json, json_default_with_numpy,
                     range_, zip_)
from .libpath import find_lib_path

def _load_lib():
    """Load LightGBM library."""
    lib_path = find_lib_path()
    if len(lib_path) == 0:
        return None
    lib = ctypes.cdll.LoadLibrary(lib_path[0])
    lib.LGBM_GetLastError.restype = ctypes.c_char_p
    return lib

_LIB = _load_lib()


def _safe_call(ret):
    """Check the return value from C API call.

    Parameters
    ----------
    ret : int
        The return value from C API calls.
    """
    if ret != 0:
        raise LightGBMError(decode_string(_LIB.LGBM_GetLastError()))


def is_numeric(obj):
    """Check whether object is a number, including NumPy numbers, etc."""
    try:
        float(obj)
        return True
    except (TypeError, ValueError):
        # TypeError: obj is not a string or a number
        # ValueError: invalid literal
        return False


def is_numpy_1d_array(data):
    """Check whether data is a numpy 1-D array."""
    return isinstance(data, np.ndarray) and len(data.shape) == 1


def is_1d_list(data):
    """Check whether data is a 1-D list."""
    return isinstance(data, list) and (not data or is_numeric(data[0]))


def list_to_1d_numpy(data, dtype=np.float32, name='list'):
    """Convert data to numpy 1-D array."""
    if is_numpy_1d_array(data):
        if data.dtype == dtype:
            return data
        else:
            return data.astype(dtype=dtype, copy=False)
    elif is_1d_list(data):
        return np.array(data, dtype=dtype, copy=False)
    elif isinstance(data, Series):
        if _get_bad_pandas_dtypes([data.dtypes]):
            raise ValueError('Series.dtypes must be int, float or bool')
        return np.array(data, dtype=dtype, copy=False)  # SparseArray should be supported as well
    else:
        raise TypeError("Wrong type({0}) for {1}.\n"
                        "It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))

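# Illustrative example (a sketch, not part of the module): list_to_1d_numpy
# normalizes the accepted 1-D inputs to a numpy array of the requested dtype, e.g.
#   list_to_1d_numpy([1, 2, 3])                 -> array([1., 2., 3.], dtype=float32)
#   list_to_1d_numpy(np.arange(3), np.float64)  -> cast to float64 via astype
# Anything else (e.g. a 2-D array) raises TypeError.
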
def cfloat32_array_to_numpy(cptr, length):
    """Convert a ctypes float pointer array to a numpy array."""
    if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
        return np.fromiter(cptr, dtype=np.float32, count=length)
    else:
        raise RuntimeError('Expected float pointer')


def cfloat64_array_to_numpy(cptr, length):
    """Convert a ctypes double pointer array to a numpy array."""
    if isinstance(cptr, ctypes.POINTER(ctypes.c_double)):
        return np.fromiter(cptr, dtype=np.float64, count=length)
    else:
        raise RuntimeError('Expected double pointer')


def cint32_array_to_numpy(cptr, length):
    """Convert a ctypes int pointer array to a numpy array."""
    if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
        return np.fromiter(cptr, dtype=np.int32, count=length)
    else:
        raise RuntimeError('Expected int pointer')


def c_str(string):
    """Convert a Python string to C string."""
    return ctypes.c_char_p(string.encode('utf-8'))


def c_array(ctype, values):
    """Convert a Python array to C array."""
    return (ctype * len(values))(*values)


def param_dict_to_str(data):
    """Convert Python dictionary to string, which is passed to C API."""
    if data is None or not data:
        return ""
    pairs = []
    for key, val in data.items():
        if isinstance(val, (list, tuple, set)) or is_numpy_1d_array(val):
            pairs.append(str(key) + '=' + ','.join(map(str, val)))
        elif isinstance(val, string_type) or isinstance(val, numeric_types) or is_numeric(val):
            pairs.append(str(key) + '=' + str(val))
        elif val is not None:
            raise TypeError('Unknown type of parameter: %s, got: %s'
                            % (key, type(val).__name__))
    return ' '.join(pairs)

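# Illustrative example (a sketch, not part of the module): the C API receives
# parameters as one space-separated string, so e.g.
#   param_dict_to_str({'num_leaves': 31, 'metric': ['auc', 'binary_logloss']})
# yields 'num_leaves=31 metric=auc,binary_logloss' (key order follows dict order).
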
class _TempFile(object):
    """Context manager that provides the path to a temporary file."""

    def __enter__(self):
        with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
            self.name = f.name
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if os.path.isfile(self.name):
            os.remove(self.name)

    def readlines(self):
        with open(self.name, "r+") as f:
            ret = f.readlines()
        return ret

    def writelines(self, lines):
        with open(self.name, "w+") as f:
            f.writelines(lines)


class LightGBMError(Exception):
    """Error thrown by LightGBM."""

    pass


class _ConfigAliases(object):
    """Mapping from a canonical parameter name to the set of all its accepted aliases."""

    aliases = {"bin_construct_sample_cnt": {"bin_construct_sample_cnt",
                                            "subsample_for_bin"},
               "boosting": {"boosting",
                            "boosting_type",
                            "boost"},
               "categorical_feature": {"categorical_feature",
                                       "cat_feature",
                                       "categorical_column",
                                       "cat_column"},
               "data_random_seed": {"data_random_seed",
                                    "data_seed"},
               "early_stopping_round": {"early_stopping_round",
                                        "early_stopping_rounds",
                                        "early_stopping",
                                        "n_iter_no_change"},
               "enable_bundle": {"enable_bundle",
                                 "is_enable_bundle",
                                 "bundle"},
               "eval_at": {"eval_at",
                           "ndcg_eval_at",
                           "ndcg_at",
                           "map_eval_at",
                           "map_at"},
               "group_column": {"group_column",
                                "group",
                                "group_id",
                                "query_column",
                                "query",
                                "query_id"},
               "header": {"header",
                          "has_header"},
               "ignore_column": {"ignore_column",
                                 "ignore_feature",
                                 "blacklist"},
               "is_enable_sparse": {"is_enable_sparse",
                                    "is_sparse",
                                    "enable_sparse",
                                    "sparse"},
               "label_column": {"label_column",
                                "label"},
               "machines": {"machines",
                            "workers",
                            "nodes"},
               "metric": {"metric",
                          "metrics",
                          "metric_types"},
               "num_class": {"num_class",
                             "num_classes"},
               "num_iterations": {"num_iterations",
                                  "num_iteration",
                                  "n_iter",
                                  "num_tree",
                                  "num_trees",
                                  "num_round",
                                  "num_rounds",
                                  "num_boost_round",
                                  "n_estimators"},
               "objective": {"objective",
                             "objective_type",
                             "app",
                             "application"},
               "pre_partition": {"pre_partition",
                                 "is_pre_partition"},
               "two_round": {"two_round",
                             "two_round_loading",
                             "use_two_round_loading"},
               "verbosity": {"verbosity",
                             "verbose"},
               "weight_column": {"weight_column",
                                 "weight"}}

    @classmethod
    def get(cls, *args):
        ret = set()
        for i in args:
            ret |= cls.aliases.get(i, {i})
        return ret


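# Illustrative example (a sketch, not part of the module): _ConfigAliases.get
# expands each canonical name to the set of all accepted spellings, e.g.
#   _ConfigAliases.get("num_iterations")  ->  includes "n_estimators", "num_boost_round", ...
#   _ConfigAliases.get("not_an_alias")    ->  {"not_an_alias"} (unknown names pass through)
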
MAX_INT32 = (1 << 31) - 1

"""Macro definition of data type in C API of LightGBM"""
C_API_DTYPE_FLOAT32 = 0
C_API_DTYPE_FLOAT64 = 1
C_API_DTYPE_INT32 = 2
C_API_DTYPE_INT64 = 3

"""Matrix is row major in Python"""
C_API_IS_ROW_MAJOR = 1

"""Macro definition of prediction type in C API of LightGBM"""
C_API_PREDICT_NORMAL = 0
C_API_PREDICT_RAW_SCORE = 1
C_API_PREDICT_LEAF_INDEX = 2
C_API_PREDICT_CONTRIB = 3

"""Data type of data field"""
FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
                     "weight": C_API_DTYPE_FLOAT32,
                     "init_score": C_API_DTYPE_FLOAT64,
                     "group": C_API_DTYPE_INT32}


def convert_from_sliced_object(data):
    """Fix the memory of multi-dimensional sliced object."""
    if isinstance(data, np.ndarray) and isinstance(data.base, np.ndarray):
        if not data.flags.c_contiguous:
            warnings.warn("Usage of np.ndarray subset (sliced data) is not recommended "
                          "because it will double the peak memory cost in LightGBM.")
            return np.copy(data)
    return data


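# Illustrative note (a sketch, not part of the module): column slices of a 2-D
# array are non-contiguous views of their base, which triggers the copy above:
#   col = np.zeros((100, 10))[:, 0]        # view with base set, not C-contiguous
#   out = convert_from_sliced_object(col)  # warns and returns a contiguous copy
#   assert out.flags.c_contiguous
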
def c_float_array(data):
    """Get pointer of float numpy array / list."""
    if is_1d_list(data):
        data = np.array(data, copy=False)
    if is_numpy_1d_array(data):
        data = convert_from_sliced_object(data)
        assert data.flags.c_contiguous
        if data.dtype == np.float32:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
            type_data = C_API_DTYPE_FLOAT32
        elif data.dtype == np.float64:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
            type_data = C_API_DTYPE_FLOAT64
        else:
            raise TypeError("Expected np.float32 or np.float64, got type({})"
                            .format(data.dtype))
    else:
        raise TypeError("Unknown type({})".format(type(data).__name__))
    return (ptr_data, type_data, data)  # return `data` as well so the temporary copy is not freed


def c_int_array(data):
    """Get pointer of int numpy array / list."""
    if is_1d_list(data):
        data = np.array(data, copy=False)
    if is_numpy_1d_array(data):
        data = convert_from_sliced_object(data)
        assert data.flags.c_contiguous
        if data.dtype == np.int32:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
            type_data = C_API_DTYPE_INT32
        elif data.dtype == np.int64:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
            type_data = C_API_DTYPE_INT64
        else:
            raise TypeError("Expected np.int32 or np.int64, got type({})"
                            .format(data.dtype))
    else:
        raise TypeError("Unknown type({})".format(type(data).__name__))
    return (ptr_data, type_data, data)  # return `data` as well so the temporary copy is not freed

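# Illustrative note (a sketch, not part of the module): both helpers return a
# (pointer, type_code, data) triple; callers must keep `data` referenced while
# the pointer is passed to the C API, e.g.
#   ptr, type_code, holder = c_float_array([1.0, 2.0, 3.0])
#   # type_code == C_API_DTYPE_FLOAT64, since a Python float list converts to float64
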
def _get_bad_pandas_dtypes(dtypes):
    pandas_dtype_mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int',
                           'int64': 'int', 'uint8': 'int', 'uint16': 'int',
                           'uint32': 'int', 'uint64': 'int', 'bool': 'int',
                           'float16': 'float', 'float32': 'float', 'float64': 'float'}
    bad_indices = [i for i, dtype in enumerate(dtypes) if (dtype.name not in pandas_dtype_mapper
                                                           and (not is_dtype_sparse(dtype)
                                                                or dtype.subtype.name not in pandas_dtype_mapper))]
    return bad_indices


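# Illustrative example (a sketch, not part of the module): only numeric and bool
# dtypes (dense or sparse) are accepted, so for a DataFrame with column dtypes
# [int64, object, float32], _get_bad_pandas_dtypes(df.dtypes) returns [1], the
# index of the unsupported object column.
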
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
    if isinstance(data, DataFrame):
        if len(data.shape) != 2 or data.shape[0] < 1:
            raise ValueError('Input data must be 2 dimensional and non-empty.')
        if feature_name == 'auto' or feature_name is None:
            data = data.rename(columns=str)
        cat_cols = list(data.select_dtypes(include=['category']).columns)
        cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
        if pandas_categorical is None:  # train dataset
            pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
        else:
            if len(cat_cols) != len(pandas_categorical):
                raise ValueError('train and valid dataset categorical_feature do not match.')
            for col, category in zip_(cat_cols, pandas_categorical):
                if list(data[col].cat.categories) != list(category):
                    data[col] = data[col].cat.set_categories(category)
        if len(cat_cols):  # cat_cols is list
            data = data.copy()  # do not alter the original DataFrame
            data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
        if categorical_feature is not None:
            if feature_name is None:
                feature_name = list(data.columns)
            if categorical_feature == 'auto':  # use cat cols from DataFrame
                categorical_feature = cat_cols_not_ordered
            else:  # use cat cols specified by user
                categorical_feature = list(categorical_feature)
        if feature_name == 'auto':
            feature_name = list(data.columns)
        bad_indices = _get_bad_pandas_dtypes(data.dtypes)
        if bad_indices:
            raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
                             "Did not expect the data types in the following fields: "
                             + ', '.join(data.columns[bad_indices]))
        data = data.values
        if data.dtype != np.float32 and data.dtype != np.float64:
            data = data.astype(np.float32)
    else:
        if feature_name == 'auto':
            feature_name = None
        if categorical_feature == 'auto':
            categorical_feature = None
    return data, feature_name, categorical_feature, pandas_categorical


def _label_from_pandas(label):
    if isinstance(label, DataFrame):
        if len(label.columns) > 1:
            raise ValueError('DataFrame for label cannot have multiple columns')
        if _get_bad_pandas_dtypes(label.dtypes):
            raise ValueError('DataFrame.dtypes for label must be int, float or bool')
        label = np.ravel(label.values.astype(np.float32, copy=False))
    return label


def _dump_pandas_categorical(pandas_categorical, file_name=None):
    pandas_str = ('\npandas_categorical:'
                  + json.dumps(pandas_categorical, default=json_default_with_numpy)
                  + '\n')
    if file_name is not None:
        with open(file_name, 'a') as f:
            f.write(pandas_str)
    return pandas_str


def _load_pandas_categorical(file_name=None, model_str=None):
    pandas_key = 'pandas_categorical:'
    offset = -len(pandas_key)
    if file_name is not None:
        max_offset = -os.path.getsize(file_name)
        with open(file_name, 'rb') as f:
            while True:
                if offset < max_offset:
                    offset = max_offset
                f.seek(offset, os.SEEK_END)
                lines = f.readlines()
                if len(lines) >= 2:
                    break
                offset *= 2
        last_line = decode_string(lines[-1]).strip()
        if not last_line.startswith(pandas_key):
            last_line = decode_string(lines[-2]).strip()
    elif model_str is not None:
        idx = model_str.rfind('\n', 0, offset)
        last_line = model_str[idx:].strip()
    if last_line.startswith(pandas_key):
        return json.loads(last_line[len(pandas_key):])
    else:
        return None


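# Illustrative note (a sketch, not part of the module): _dump_pandas_categorical
# appends the category mappings as a JSON trailer, so the last line of a saved
# model file looks like
#   pandas_categorical:[["a", "b", "c"], [10, 20]]
# and _load_pandas_categorical reads the file backwards just far enough to find it.
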
class _InnerPredictor(object):
    """_InnerPredictor of LightGBM.

    Not exposed to user.
    Used only for prediction, usually used for continued training.

    .. note::

        Can be converted from Booster, but cannot be converted to Booster.
    """
    def __init__(self, model_file=None, booster_handle=None, pred_parameter=None):
        """Initialize the _InnerPredictor.

        Parameters
        ----------
        model_file : string or None, optional (default=None)
            Path to the model file.
        booster_handle : object or None, optional (default=None)
            Handle of Booster.
        pred_parameter : dict or None, optional (default=None)
            Other parameters for the prediction.
        """
        self.handle = ctypes.c_void_p()
        self.__is_manage_handle = True
        if model_file is not None:
            # Prediction task
            out_num_iterations = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(out_num_iterations),
                ctypes.byref(self.handle)))
            out_num_class = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
            self.num_total_iteration = out_num_iterations.value
            self.pandas_categorical = _load_pandas_categorical(file_name=model_file)
        elif booster_handle is not None:
            self.__is_manage_handle = False
            self.handle = booster_handle
            out_num_class = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
            out_num_iterations = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
                self.handle,
                ctypes.byref(out_num_iterations)))
            self.num_total_iteration = out_num_iterations.value
            self.pandas_categorical = None
        else:
            raise TypeError('Need model_file or booster_handle to create a predictor')

        pred_parameter = {} if pred_parameter is None else pred_parameter
        self.pred_parameter = param_dict_to_str(pred_parameter)

    def __del__(self):
        try:
            if self.__is_manage_handle:
                _safe_call(_LIB.LGBM_BoosterFree(self.handle))
        except AttributeError:
            pass

    def __getstate__(self):
        this = self.__dict__.copy()
        this.pop('handle', None)
        return this

    def predict(self, data, num_iteration=-1,
                raw_score=False, pred_leaf=False, pred_contrib=False, data_has_header=False,
                is_reshape=True):
        """Predict logic.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
            Data source for prediction.
            When data type is string, it represents the path of txt file.
        num_iteration : int, optional (default=-1)
            Iteration used for prediction.
        raw_score : bool, optional (default=False)
            Whether to predict raw scores.
        pred_leaf : bool, optional (default=False)
            Whether to predict leaf index.
        pred_contrib : bool, optional (default=False)
            Whether to predict feature contributions.
        data_has_header : bool, optional (default=False)
            Whether data has header.
            Used only for txt data.
        is_reshape : bool, optional (default=True)
            Whether to reshape to (nrow, ncol).

        Returns
        -------
        result : numpy array
            Prediction result.
        """
        if isinstance(data, Dataset):
            raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
        data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]
        predict_type = C_API_PREDICT_NORMAL
        if raw_score:
            predict_type = C_API_PREDICT_RAW_SCORE
        if pred_leaf:
            predict_type = C_API_PREDICT_LEAF_INDEX
        if pred_contrib:
            predict_type = C_API_PREDICT_CONTRIB
        int_data_has_header = 1 if data_has_header else 0
        if num_iteration > self.num_total_iteration:
            num_iteration = self.num_total_iteration

        if isinstance(data, string_type):
            with _TempFile() as f:
                _safe_call(_LIB.LGBM_BoosterPredictForFile(
                    self.handle,
                    c_str(data),
                    ctypes.c_int(int_data_has_header),
                    ctypes.c_int(predict_type),
                    ctypes.c_int(num_iteration),
                    c_str(self.pred_parameter),
                    c_str(f.name)))
                lines = f.readlines()
                nrow = len(lines)
                preds = [float(token) for line in lines for token in line.split('\t')]
                preds = np.array(preds, dtype=np.float64, copy=False)
        elif isinstance(data, scipy.sparse.csr_matrix):
            preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
        elif isinstance(data, scipy.sparse.csc_matrix):
            preds, nrow = self.__pred_for_csc(data, num_iteration, predict_type)
        elif isinstance(data, np.ndarray):
            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
        elif isinstance(data, list):
            try:
                data = np.array(data)
            except BaseException:
                raise ValueError('Cannot convert data list to numpy array.')
            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
        elif isinstance(data, DataTable):
            preds, nrow = self.__pred_for_np2d(data.to_numpy(), num_iteration, predict_type)
        else:
            try:
                warnings.warn('Converting data to scipy sparse matrix.')
                csr = scipy.sparse.csr_matrix(data)
            except BaseException:
                raise TypeError('Cannot predict data for type {}'.format(type(data).__name__))
            preds, nrow = self.__pred_for_csr(csr, num_iteration, predict_type)
        if pred_leaf:
            preds = preds.astype(np.int32)
        if is_reshape and preds.size != nrow:
            if preds.size % nrow == 0:
                preds = preds.reshape(nrow, -1)
            else:
                raise ValueError('Length of predict result (%d) is not divisible by nrow (%d)'
                                 % (preds.size, nrow))
        return preds

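    # Illustrative usage (a sketch; _InnerPredictor is internal, and 'model.txt'
    # and X are placeholder assumptions):
    #   predictor = _InnerPredictor(model_file='model.txt')
    #   preds = predictor.predict(X)                   # scores, reshaped to (nrow, ncol) if is_reshape
    #   leaves = predictor.predict(X, pred_leaf=True)  # leaf indices as int32
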
    def __get_num_preds(self, num_iteration, nrow, predict_type):
        """Get size of prediction result."""
        if nrow > MAX_INT32:
            raise LightGBMError('LightGBM cannot perform prediction for data '
                                'with number of rows greater than MAX_INT32 (%d).\n'
                                'You can split your data into chunks '
                                'and then concatenate predictions for them' % MAX_INT32)
        n_preds = ctypes.c_int64(0)
        _safe_call(_LIB.LGBM_BoosterCalcNumPredict(
            self.handle,
            ctypes.c_int(nrow),
            ctypes.c_int(predict_type),
            ctypes.c_int(num_iteration),
            ctypes.byref(n_preds)))
        return n_preds.value

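    # Illustrative note (an assumption from the C API semantics, not verified here):
    # the per-row prediction size depends on predict_type, roughly
    #   normal / raw score : num_class values per row
    #   leaf index         : num_class * num_iteration values per row
    #   contrib            : num_class * (num_features + 1) values per row
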
    def __pred_for_np2d(self, mat, num_iteration, predict_type):
        """Predict for a 2-D numpy matrix."""
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray or list must be 2 dimensional')

        def inner_predict(mat, num_iteration, predict_type, preds=None):
            if mat.dtype == np.float32 or mat.dtype == np.float64:
                data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
            else:  # change non-float data to float data, need to copy
                data = np.array(mat.reshape(mat.size), dtype=np.float32)
            ptr_data, type_ptr_data, _ = c_float_array(data)
            n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type)
            if preds is None:
                preds = np.zeros(n_preds, dtype=np.float64)
            elif len(preds.shape) != 1 or len(preds) != n_preds:
                raise ValueError("Wrong length of pre-allocated predict array")
            out_num_preds = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterPredictForMat(
                self.handle,
                ptr_data,
                ctypes.c_int(type_ptr_data),
                ctypes.c_int(mat.shape[0]),
                ctypes.c_int(mat.shape[1]),
                ctypes.c_int(C_API_IS_ROW_MAJOR),
                ctypes.c_int(predict_type),
                ctypes.c_int(num_iteration),
                c_str(self.pred_parameter),
                ctypes.byref(out_num_preds),
                preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
            if n_preds != out_num_preds.value:
                raise ValueError("Wrong length for predict results")
            return preds, mat.shape[0]

        nrow = mat.shape[0]
        if nrow > MAX_INT32:
            sections = np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)
            # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
            n_preds = [self.__get_num_preds(num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])]
            n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
            preds = np.zeros(sum(n_preds), dtype=np.float64)
            for chunk, (start_idx_pred, end_idx_pred) in zip_(np.array_split(mat, sections),
                                                              zip_(n_preds_sections, n_preds_sections[1:])):
                # avoid memory consumption from array concatenation operations
                inner_predict(chunk, num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
            return preds, nrow
        else:
            return inner_predict(mat, num_iteration, predict_type)

    def __pred_for_csr(self, csr, num_iteration, predict_type):
        """Predict for CSR data."""
        def inner_predict(csr, num_iteration, predict_type, preds=None):
            nrow = len(csr.indptr) - 1
            n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
            if preds is None:
                preds = np.zeros(n_preds, dtype=np.float64)
            elif len(preds.shape) != 1 or len(preds) != n_preds:
                raise ValueError("Wrong length of pre-allocated predict array")
            out_num_preds = ctypes.c_int64(0)

            ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr)
            ptr_data, type_ptr_data, _ = c_float_array(csr.data)

            assert csr.shape[1] <= MAX_INT32
            csr.indices = csr.indices.astype(np.int32, copy=False)

            _safe_call(_LIB.LGBM_BoosterPredictForCSR(
                self.handle,
                ptr_indptr,
                ctypes.c_int32(type_ptr_indptr),
                csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
                ptr_data,
                ctypes.c_int(type_ptr_data),
                ctypes.c_int64(len(csr.indptr)),
                ctypes.c_int64(len(csr.data)),
                ctypes.c_int64(csr.shape[1]),
                ctypes.c_int(predict_type),
                ctypes.c_int(num_iteration),
                c_str(self.pred_parameter),
                ctypes.byref(out_num_preds),
                preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
            if n_preds != out_num_preds.value:
                raise ValueError("Wrong length for predict results")
            return preds, nrow

        nrow = len(csr.indptr) - 1
        if nrow > MAX_INT32:
            sections = [0] + list(np.arange(start=MAX_INT32, stop=nrow, step=MAX_INT32)) + [nrow]
            # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
            n_preds = [self.__get_num_preds(num_iteration, i, predict_type) for i in np.diff(sections)]
            n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
            preds = np.zeros(sum(n_preds), dtype=np.float64)
            for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip_(zip_(sections, sections[1:]),
                                                                             zip_(n_preds_sections, n_preds_sections[1:])):
                # avoid memory consumption from array concatenation operations
                inner_predict(csr[start_idx:end_idx], num_iteration, predict_type, preds[start_idx_pred:end_idx_pred])
            return preds, nrow
        else:
            return inner_predict(csr, num_iteration, predict_type)

    def __pred_for_csc(self, csc, num_iteration, predict_type):
        """Predict for CSC data."""
        nrow = csc.shape[0]
        if nrow > MAX_INT32:
            return self.__pred_for_csr(csc.tocsr(), num_iteration, predict_type)
        n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
        preds = np.zeros(n_preds, dtype=np.float64)
        out_num_preds = ctypes.c_int64(0)

        ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr)
        ptr_data, type_ptr_data, _ = c_float_array(csc.data)

        assert csc.shape[0] <= MAX_INT32
        csc.indices = csc.indices.astype(np.int32, copy=False)

        _safe_call(_LIB.LGBM_BoosterPredictForCSC(
            self.handle,
            ptr_indptr,
            ctypes.c_int32(type_ptr_indptr),
            csc.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int64(len(csc.indptr)),
            ctypes.c_int64(len(csc.data)),
            ctypes.c_int64(csc.shape[0]),
            ctypes.c_int(predict_type),
            ctypes.c_int(num_iteration),
            c_str(self.pred_parameter),
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
        if n_preds != out_num_preds.value:
            raise ValueError("Wrong length for predict results")
        return preds, nrow

class Dataset(object):
    """Dataset in LightGBM."""

    def __init__(self, data, label=None, reference=None,
                 weight=None, group=None, init_score=None, silent=False,
                 feature_name='auto', categorical_feature='auto', params=None,
                 free_raw_data=True):
        """Initialize Dataset.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse or list of numpy arrays
            Data source of Dataset.
            If string, it represents the path to txt file.
        label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
            Label of the data.
        reference : Dataset or None, optional (default=None)
            If this is Dataset for validation, training data should be used as reference.
        weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
            Weight for each instance.
        group : list, numpy 1-D array, pandas Series or None, optional (default=None)
            Group/query size for Dataset.
        init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
            Init score for Dataset.
        silent : bool, optional (default=False)
            Whether to print messages during construction.
        feature_name : list of strings or 'auto', optional (default="auto")
            Feature names.
            If 'auto' and data is pandas DataFrame, data columns names are used.
        categorical_feature : list of strings or int, or 'auto', optional (default="auto")
            Categorical features.
            If list of int, interpreted as indices.
            If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
            If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
            All values in categorical features should be less than int32 max value (2147483647).
            Large values could be memory consuming. Consider using consecutive integers starting from zero.
            All negative values in categorical features will be treated as missing values.
            The output cannot be monotonically constrained with respect to a categorical feature.
        params : dict or None, optional (default=None)
            Other parameters for Dataset.
        free_raw_data : bool, optional (default=True)
            If True, raw data is freed after constructing inner Dataset.
        """
        self.handle = None
        self.data = data
        self.label = label
        self.reference = reference
        self.weight = weight
        self.group = group
        self.init_score = init_score
        self.silent = silent
        self.feature_name = feature_name
        self.categorical_feature = categorical_feature
        self.params = copy.deepcopy(params)
        self.free_raw_data = free_raw_data
        self.used_indices = None
        self.need_slice = True
        self._predictor = None
        self.pandas_categorical = None
        self.params_back_up = None
        self.feature_penalty = None
        self.monotone_constraints = None
        self.version = 0

    def __del__(self):
        try:
            self._free_handle()
        except AttributeError:
            pass

    def get_params(self):
        """Get the used parameters in the Dataset.

        Returns
        -------
        params : dict or None
            The used parameters in this Dataset object.
        """
        if self.params is not None:
            # no min_data, nthreads and verbose in this function
            dataset_params = _ConfigAliases.get("bin_construct_sample_cnt",
                                                "categorical_feature",
                                                "data_random_seed",
                                                "enable_bundle",
                                                "feature_pre_filter",
                                                "forcedbins_filename",
                                                "group_column",
                                                "header",
                                                "ignore_column",
                                                "is_enable_sparse",
                                                "label_column",
                                                "max_bin",
                                                "max_bin_by_feature",
                                                "min_data_in_bin",
                                                "pre_partition",
                                                "two_round",
                                                "use_missing",
                                                "weight_column",
                                                "zero_as_missing")
            return {k: v for k, v in self.params.items() if k in dataset_params}

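    # Illustrative example (a sketch, not part of the module): get_params keeps
    # only Dataset-level keys, so with params={'max_bin': 255, 'learning_rate': 0.1}
    # it returns {'max_bin': 255}; booster-only keys such as learning_rate are dropped.
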
    def _free_handle(self):
        if self.handle is not None:
            _safe_call(_LIB.LGBM_DatasetFree(self.handle))
            self.handle = None
        self.need_slice = True
        if self.used_indices is not None:
            self.data = None
        return self

    def _set_init_score_by_predictor(self, predictor, data, used_indices=None):
        data_has_header = False
        if isinstance(data, string_type):
            # check data has header or not
            data_has_header = any(self.params.get(alias, False) for alias in _ConfigAliases.get("header"))
        init_score = predictor.predict(data,
                                       raw_score=True,
                                       data_has_header=data_has_header,
                                       is_reshape=False)
        num_data = self.num_data()
        if used_indices is not None:
            assert not self.need_slice
            if isinstance(data, string_type):
                sub_init_score = np.zeros(num_data * predictor.num_class, dtype=np.float32)
                assert num_data == len(used_indices)
                for i in range_(len(used_indices)):
                    for j in range_(predictor.num_class):
                        sub_init_score[i * predictor.num_class + j] = init_score[used_indices[i] * predictor.num_class + j]
                init_score = sub_init_score
        if predictor.num_class > 1:
            # need to regroup init_score
            new_init_score = np.zeros(init_score.size, dtype=np.float32)
            for i in range_(num_data):
                for j in range_(predictor.num_class):
                    new_init_score[j * num_data + i] = init_score[i * predictor.num_class + j]
            init_score = new_init_score
        self.set_init_score(init_score)

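    # Illustrative note (a sketch, not part of the module): for multiclass models
    # the predictor emits scores row-major ([row0_class0, row0_class1, ...]) while
    # init_score is stored grouped by class ([all rows class0, all rows class1, ...]);
    # the regrouping loop above converts between the two layouts.
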
    def _lazy_init(self, data, label=None, reference=None,
                   weight=None, group=None, init_score=None, predictor=None,
                   silent=False, feature_name='auto',
                   categorical_feature='auto', params=None):
        if data is None:
            self.handle = None
            return self
        if reference is not None:
            self.pandas_categorical = reference.pandas_categorical
            categorical_feature = reference.categorical_feature
        data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data,
                                                                                             feature_name,
                                                                                             categorical_feature,
                                                                                             self.pandas_categorical)
        label = _label_from_pandas(label)

        # process for args
        params = {} if params is None else params
        args_names = (getattr(self.__class__, '_lazy_init')
                      .__code__
                      .co_varnames[:getattr(self.__class__, '_lazy_init').__code__.co_argcount])
        for key, _ in params.items():
            if key in args_names:
                warnings.warn('{0} keyword has been found in `params` and will be ignored.\n'
                              'Please use {0} argument of the Dataset constructor to pass this parameter.'
                              .format(key))
        # user can set verbose with params, it has higher priority
        if not any(verbose_alias in params for verbose_alias in _ConfigAliases.get("verbosity")) and silent:
            params["verbose"] = -1
        # get categorical features
        if categorical_feature is not None:
            categorical_indices = set()
            feature_dict = {}
            if feature_name is not None:
                feature_dict = {name: i for i, name in enumerate(feature_name)}
            for name in categorical_feature:
                if isinstance(name, string_type) and name in feature_dict:
                    categorical_indices.add(feature_dict[name])
                elif isinstance(name, integer_types):
                    categorical_indices.add(name)
                else:
                    raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
                                    .format(type(name).__name__, name))
            if categorical_indices:
                for cat_alias in _ConfigAliases.get("categorical_feature"):
                    if cat_alias in params:
                        warnings.warn('{} in param dict is overridden.'.format(cat_alias))
                        params.pop(cat_alias, None)
                params['categorical_column'] = sorted(categorical_indices)

        params_str = param_dict_to_str(params)
        self.params = params
        # process for reference dataset
        ref_dataset = None
        if isinstance(reference, Dataset):
            ref_dataset = reference.construct().handle
        elif reference is not None:
            raise TypeError('Reference dataset should be None or dataset instance')
        # start construct data
        if isinstance(data, string_type):
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_DatasetCreateFromFile(
                c_str(data),
                c_str(params_str),
                ref_dataset,
                ctypes.byref(self.handle)))
        elif isinstance(data, scipy.sparse.csr_matrix):
            self.__init_from_csr(data, params_str, ref_dataset)
        elif isinstance(data, scipy.sparse.csc_matrix):
            self.__init_from_csc(data, params_str, ref_dataset)
        elif isinstance(data, np.ndarray):
            self.__init_from_np2d(data, params_str, ref_dataset)
        elif isinstance(data, list) and len(data) > 0 and all(isinstance(x, np.ndarray) for x in data):
            self.__init_from_list_np2d(data, params_str, ref_dataset)
        elif isinstance(data, DataTable):
            self.__init_from_np2d(data.to_numpy(), params_str, ref_dataset)
        else:
            try:
                csr = scipy.sparse.csr_matrix(data)
                self.__init_from_csr(csr, params_str, ref_dataset)
            except BaseException:
                raise TypeError('Cannot initialize Dataset from {}'.format(type(data).__name__))
        if label is not None:
            self.set_label(label)
        if self.get_label() is None:
            raise ValueError("Label should not be None")
        if weight is not None:
            self.set_weight(weight)
        if group is not None:
            self.set_group(group)
        if isinstance(predictor, _InnerPredictor):
            if self._predictor is None and init_score is not None:
                warnings.warn("The init_score will be overridden by the prediction of init_model.")
            self._set_init_score_by_predictor(predictor, data)
        elif init_score is not None:
            self.set_init_score(init_score)
        elif predictor is not None:
            raise TypeError('Wrong predictor type {}'.format(type(predictor).__name__))
        # set feature names
        return self.set_feature_name(feature_name)

    def __init_from_np2d(self, mat, params_str, ref_dataset):
        """Initialize data from a 2-D numpy matrix."""
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')

        self.handle = ctypes.c_void_p()
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
        else:  # change non-float data to float data, need to copy
            data = np.array(mat.reshape(mat.size), dtype=np.float32)

        ptr_data, type_ptr_data, _ = c_float_array(data)
        _safe_call(_LIB.LGBM_DatasetCreateFromMat(
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int(mat.shape[0]),
            ctypes.c_int(mat.shape[1]),
            ctypes.c_int(C_API_IS_ROW_MAJOR),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self
    def __init_from_list_np2d(self, mats, params_str, ref_dataset):
        """Initialize data from a list of 2-D numpy matrices."""
        ncol = mats[0].shape[1]
        nrow = np.zeros((len(mats),), np.int32)
        if mats[0].dtype == np.float64:
            ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))()
        else:
            ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))()

        holders = []
        type_ptr_data = None

        for i, mat in enumerate(mats):
            if len(mat.shape) != 2:
                raise ValueError('Input numpy.ndarray must be 2 dimensional')

            if mat.shape[1] != ncol:
                raise ValueError('Input arrays must have same number of columns')

            nrow[i] = mat.shape[0]

            if mat.dtype == np.float32 or mat.dtype == np.float64:
                mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
            else:  # change non-float data to float data, need to copy
                mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)

            chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i])
            if type_ptr_data is not None and chunk_type_ptr_data != type_ptr_data:
                raise ValueError('Input chunks must have same type')
            ptr_data[i] = chunk_ptr_data
            type_ptr_data = chunk_type_ptr_data
            holders.append(holder)

        self.handle = ctypes.c_void_p()
        _safe_call(_LIB.LGBM_DatasetCreateFromMats(
            ctypes.c_int(len(mats)),
            ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))),
            ctypes.c_int(type_ptr_data),
            nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ctypes.c_int(ncol),
            ctypes.c_int(C_API_IS_ROW_MAJOR),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self
    def __init_from_csr(self, csr, params_str, ref_dataset):
1039
        """Initialize data from a CSR matrix."""
wxchan's avatar
wxchan committed
1040
        if len(csr.indices) != len(csr.data):
1041
            raise ValueError('Length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
wxchan's avatar
wxchan committed
1042
1043
        self.handle = ctypes.c_void_p()

        ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr)
        ptr_data, type_ptr_data, _ = c_float_array(csr.data)

        assert csr.shape[1] <= MAX_INT32
        csr.indices = csr.indices.astype(np.int32, copy=False)

        _safe_call(_LIB.LGBM_DatasetCreateFromCSR(
            ptr_indptr,
            ctypes.c_int(type_ptr_indptr),
            csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int64(len(csr.indptr)),
            ctypes.c_int64(len(csr.data)),
            ctypes.c_int64(csr.shape[1]),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self

    def __init_from_csc(self, csc, params_str, ref_dataset):
        """Initialize data from a CSC matrix."""
        if len(csc.indices) != len(csc.data):
            raise ValueError('Length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data)))
        self.handle = ctypes.c_void_p()

        ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr)
        ptr_data, type_ptr_data, _ = c_float_array(csc.data)

        assert csc.shape[0] <= MAX_INT32
        csc.indices = csc.indices.astype(np.int32, copy=False)

        _safe_call(_LIB.LGBM_DatasetCreateFromCSC(
            ptr_indptr,
            ctypes.c_int(type_ptr_indptr),
            csc.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            ctypes.c_int(type_ptr_data),
            ctypes.c_int64(len(csc.indptr)),
            ctypes.c_int64(len(csc.data)),
            ctypes.c_int64(csc.shape[0]),
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))
        return self
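
    # Illustrative sketch (editorial): CSR and CSC inputs follow the same
    # pattern (indptr as an int array, data as a float array, indices cast to
    # int32), so any scipy.sparse matrix can back a Dataset. Hypothetical usage:
    #
    #     import numpy as np
    #     import scipy.sparse
    #     import lightgbm as lgb
    #     X = scipy.sparse.random(1000, 20, density=0.1, format='csr')
    #     ds = lgb.Dataset(X, label=np.random.rand(1000))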

    def construct(self):
        """Lazy init: construct the inner Dataset object on first use.

        Returns
        -------
        self : Dataset
            Constructed Dataset object.
        """
        if self.handle is None:
            if self.reference is not None:
                reference_params = self.reference.get_params()
                if self.get_params() != reference_params:
                    warnings.warn('Overriding the parameters from Reference Dataset.')
                    self._update_params(reference_params)
                if self.used_indices is None:
                    # create valid
                    self._lazy_init(self.data, label=self.label, reference=self.reference,
                                    weight=self.weight, group=self.group,
                                    init_score=self.init_score, predictor=self._predictor,
                                    silent=self.silent, feature_name=self.feature_name, params=self.params)
                else:
                    # construct subset
                    used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
                    assert used_indices.flags.c_contiguous
                    if self.reference.group is not None:
                        group_info = np.array(self.reference.group).astype(np.int32, copy=False)
                        _, self.group = np.unique(np.repeat(range_(len(group_info)), repeats=group_info)[self.used_indices],
                                                  return_counts=True)
                    self.handle = ctypes.c_void_p()
                    params_str = param_dict_to_str(self.params)
                    _safe_call(_LIB.LGBM_DatasetGetSubset(
                        self.reference.construct().handle,
                        used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
                        ctypes.c_int(used_indices.shape[0]),
                        c_str(params_str),
                        ctypes.byref(self.handle)))
                    if not self.free_raw_data:
                        self.get_data()
                    if self.group is not None:
                        self.set_group(self.group)
                    if self.get_label() is None:
                        raise ValueError("Label should not be None.")
                    if isinstance(self._predictor, _InnerPredictor) and self._predictor is not self.reference._predictor:
                        self.get_data()
                        self._set_init_score_by_predictor(self._predictor, self.data, used_indices)
            else:
                # create train
                self._lazy_init(self.data, label=self.label,
                                weight=self.weight, group=self.group,
                                init_score=self.init_score, predictor=self._predictor,
                                silent=self.silent, feature_name=self.feature_name,
                                categorical_feature=self.categorical_feature, params=self.params)
            if self.free_raw_data:
                self.data = None
        return self
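
    # Illustrative sketch (editorial): construction is deferred until the
    # inner Dataset is actually needed (e.g. by training), so parameters can
    # still change after lgb.Dataset(...) returns. Hypothetical flow:
    #
    #     ds = lgb.Dataset(X, label=y)   # no C++ object yet, ds.handle is None
    #     ds.construct()                 # binning happens here, handle is set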

    def create_valid(self, data, label=None, weight=None, group=None,
                     init_score=None, silent=False, params=None):
        """Create validation data aligned with the current Dataset.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse or list of numpy arrays
            Data source of Dataset.
            If string, it represents the path to a text file.
        label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
            Label of the data.
        weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
            Weight for each instance.
        group : list, numpy 1-D array, pandas Series or None, optional (default=None)
            Group/query size for Dataset.
        init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
            Init score for Dataset.
        silent : bool, optional (default=False)
            Whether to print messages during construction.
        params : dict or None, optional (default=None)
            Other parameters for validation Dataset.

        Returns
        -------
        valid : Dataset
            Validation Dataset with reference to self.
        """
        ret = Dataset(data, label=label, reference=self,
                      weight=weight, group=group, init_score=init_score,
                      silent=silent, params=params, free_raw_data=self.free_raw_data)
        ret._predictor = self._predictor
        ret.pandas_categorical = self.pandas_categorical
        return ret
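
    # Illustrative sketch (editorial): a validation Dataset must share the
    # bin mappers of its training Dataset, which is what the `reference=self`
    # wiring above provides. Hypothetical usage:
    #
    #     train_ds = lgb.Dataset(X_train, label=y_train)
    #     valid_ds = train_ds.create_valid(X_valid, label=y_valid)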

    def subset(self, used_indices, params=None):
        """Get subset of current Dataset.

        Parameters
        ----------
        used_indices : list of int
            Indices used to create the subset.
        params : dict or None, optional (default=None)
            These parameters will be passed to Dataset constructor.

        Returns
        -------
        subset : Dataset
            Subset of the current Dataset.
        """
        if params is None:
            params = self.params
        ret = Dataset(None, reference=self, feature_name=self.feature_name,
                      categorical_feature=self.categorical_feature, params=params,
                      free_raw_data=self.free_raw_data)
        ret._predictor = self._predictor
        ret.pandas_categorical = self.pandas_categorical
        ret.used_indices = sorted(used_indices)
        return ret
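
    # Illustrative sketch (editorial): a subset shares its parent's raw data
    # and stores only the sorted row indices; the actual slicing is deferred
    # to construct(). Hypothetical usage:
    #
    #     full_ds = lgb.Dataset(X, label=y, free_raw_data=False)
    #     fold_ds = full_ds.subset([0, 2, 5, 7]).construct()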

    def save_binary(self, filename):
        """Save Dataset to a binary file.

        .. note::

            Please note that `init_score` is not saved in the binary file.
            If you need it, please set it again after loading the Dataset.

        Parameters
        ----------
        filename : string
            Name of the output file.

        Returns
        -------
        self : Dataset
            Returns self.
        """
        _safe_call(_LIB.LGBM_DatasetSaveBinary(
            self.construct().handle,
            c_str(filename)))
        return self
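
    # Illustrative sketch (editorial): the binary file stores already-binned
    # features, so reloading skips the binning pass. Hypothetical usage:
    #
    #     lgb.Dataset(X, label=y).save_binary('train.bin')
    #     ds = lgb.Dataset('train.bin')   # reloads the pre-binned data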

    def _update_params(self, params):
        if not params:
            return self
        params = copy.deepcopy(params)

        def update():
            if not self.params:
                self.params = params
            else:
                self.params_back_up = copy.deepcopy(self.params)
                self.params.update(params)

        if self.handle is None:
            update()
        elif params is not None:
            ret = _LIB.LGBM_DatasetUpdateParamChecking(
                c_str(param_dict_to_str(self.params)),
                c_str(param_dict_to_str(params)))
            if ret != 0:
                # could be updated if data is not freed
                if self.data is not None:
                    update()
                    self._free_handle()
                else:
                    raise LightGBMError(decode_string(_LIB.LGBM_GetLastError()))
        return self

    def _reverse_update_params(self):
        if self.handle is None:
            self.params = copy.deepcopy(self.params_back_up)
            self.params_back_up = None
        return self

    def set_field(self, field_name, data):
        """Set property into the Dataset.

        Parameters
        ----------
        field_name : string
            The field name of the information.
        data : list, numpy 1-D array, pandas Series or None
            The array of data to be set.

        Returns
        -------
        self : Dataset
            Dataset with set property.
        """
        if self.handle is None:
            raise Exception("Cannot set %s before constructing the Dataset" % field_name)
        if data is None:
            # set to None
            _safe_call(_LIB.LGBM_DatasetSetField(
                self.handle,
                c_str(field_name),
                None,
                ctypes.c_int(0),
                ctypes.c_int(FIELD_TYPE_MAPPER[field_name])))
            return self
        dtype = np.float32
        if field_name == 'group':
            dtype = np.int32
        elif field_name == 'init_score':
            dtype = np.float64
        data = list_to_1d_numpy(data, dtype, name=field_name)
        if data.dtype == np.float32 or data.dtype == np.float64:
            ptr_data, type_data, _ = c_float_array(data)
        elif data.dtype == np.int32:
            ptr_data, type_data, _ = c_int_array(data)
        else:
            raise TypeError("Expected np.float32/64 or np.int32, met type({})".format(data.dtype))
        if type_data != FIELD_TYPE_MAPPER[field_name]:
            raise TypeError("Input type error for set_field")
        _safe_call(_LIB.LGBM_DatasetSetField(
            self.handle,
            c_str(field_name),
            ptr_data,
            ctypes.c_int(len(data)),
            ctypes.c_int(type_data)))
        self.version += 1
        return self

    def get_field(self, field_name):
        """Get property from the Dataset.

        Parameters
        ----------
        field_name : string
            The field name of the information.

        Returns
        -------
        info : numpy array
            A numpy array with information from the Dataset.
        """
        if self.handle is None:
            raise Exception("Cannot get %s before constructing the Dataset" % field_name)
        tmp_out_len = ctypes.c_int()
        out_type = ctypes.c_int()
        ret = ctypes.POINTER(ctypes.c_void_p)()
        _safe_call(_LIB.LGBM_DatasetGetField(
            self.handle,
            c_str(field_name),
            ctypes.byref(tmp_out_len),
            ctypes.byref(ret),
            ctypes.byref(out_type)))
        if out_type.value != FIELD_TYPE_MAPPER[field_name]:
            raise TypeError("Return type error for get_field")
        if tmp_out_len.value == 0:
            return None
        if out_type.value == C_API_DTYPE_INT32:
            return cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
        elif out_type.value == C_API_DTYPE_FLOAT32:
            return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
        elif out_type.value == C_API_DTYPE_FLOAT64:
            return cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value)
        else:
            raise TypeError("Unknown type")
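
    # Illustrative sketch (editorial): set_field/get_field are the low-level
    # accessors behind set_label, set_weight, etc., and they require a
    # constructed Dataset. Field dtypes are fixed: 'group' is int32,
    # 'init_score' is float64, everything else float32. Hypothetical usage:
    #
    #     ds = lgb.Dataset(X, label=y).construct()
    #     ds.set_field('weight', np.ones(ds.num_data(), dtype=np.float32))
    #     w = ds.get_field('weight')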

    def set_categorical_feature(self, categorical_feature):
        """Set categorical features.

        Parameters
        ----------
        categorical_feature : list of int or strings
            Names or indices of categorical features.

        Returns
        -------
        self : Dataset
            Dataset with set categorical features.
        """
        if self.categorical_feature == categorical_feature:
            return self
        if self.data is not None:
            if self.categorical_feature is None:
                self.categorical_feature = categorical_feature
                return self._free_handle()
            elif categorical_feature == 'auto':
                warnings.warn('Using categorical_feature in Dataset.')
                return self
            else:
                warnings.warn('categorical_feature in Dataset is overridden.\n'
                              'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
                self.categorical_feature = categorical_feature
                return self._free_handle()
        else:
            raise LightGBMError("Cannot set categorical feature after raw data has been freed; "
                                "set free_raw_data=False when constructing the Dataset to avoid this.")

    def _set_predictor(self, predictor):
        """Set predictor for continued training.

        It is not recommended for users to call this function.
        Please use init_model argument in engine.train() or engine.cv() instead.
        """
        if predictor is self._predictor:
            return self
        if self.data is not None or (self.used_indices is not None
                                     and self.reference is not None
                                     and self.reference.data is not None):
            self._predictor = predictor
            return self._free_handle()
        else:
            raise LightGBMError("Cannot set predictor after raw data has been freed; "
                                "set free_raw_data=False when constructing the Dataset to avoid this.")

    def set_reference(self, reference):
        """Set reference Dataset.

        Parameters
        ----------
        reference : Dataset
            Reference that is used as a template to construct the current Dataset.

        Returns
        -------
        self : Dataset
            Dataset with set reference.
        """
        self.set_categorical_feature(reference.categorical_feature) \
            .set_feature_name(reference.feature_name) \
            ._set_predictor(reference._predictor)
        # we're done if self and reference share a common upstream reference
        if self.get_ref_chain().intersection(reference.get_ref_chain()):
            return self
        if self.data is not None:
            self.reference = reference
            return self._free_handle()
        else:
            raise LightGBMError("Cannot set reference after raw data has been freed; "
                                "set free_raw_data=False when constructing the Dataset to avoid this.")

    def set_feature_name(self, feature_name):
        """Set feature name.

        Parameters
        ----------
        feature_name : list of strings
            Feature names.

        Returns
        -------
        self : Dataset
            Dataset with set feature name.
        """
        if feature_name != 'auto':
            self.feature_name = feature_name
        if self.handle is not None and feature_name is not None and feature_name != 'auto':
            if len(feature_name) != self.num_feature():
                raise ValueError("Length of feature_name({}) and num_feature({}) don't match"
                                 .format(len(feature_name), self.num_feature()))
            c_feature_name = [c_str(name) for name in feature_name]
            _safe_call(_LIB.LGBM_DatasetSetFeatureNames(
                self.handle,
                c_array(ctypes.c_char_p, c_feature_name),
                ctypes.c_int(len(feature_name))))
        return self

    def set_label(self, label):
        """Set label of Dataset.

        Parameters
        ----------
        label : list, numpy 1-D array, pandas Series / one-column DataFrame or None
            The label information to be set into Dataset.

        Returns
        -------
        self : Dataset
            Dataset with set label.
        """
        self.label = label
        if self.handle is not None:
            label = list_to_1d_numpy(_label_from_pandas(label), name='label')
            self.set_field('label', label)
            self.label = self.get_field('label')  # original values can be modified on the C++ side
        return self

    def set_weight(self, weight):
        """Set weight of each instance.

        Parameters
        ----------
        weight : list, numpy 1-D array, pandas Series or None
            Weight to be set for each data point.

        Returns
        -------
        self : Dataset
            Dataset with set weight.
        """
        if weight is not None and np.all(weight == 1):
            weight = None
        self.weight = weight
        if self.handle is not None and weight is not None:
            weight = list_to_1d_numpy(weight, name='weight')
            self.set_field('weight', weight)
            self.weight = self.get_field('weight')  # original values can be modified on the C++ side
        return self

    def set_init_score(self, init_score):
        """Set init score of Booster to start from.

        Parameters
        ----------
        init_score : list, numpy 1-D array, pandas Series or None
            Init score for Booster.

        Returns
        -------
        self : Dataset
            Dataset with set init score.
        """
        self.init_score = init_score
        if self.handle is not None and init_score is not None:
            init_score = list_to_1d_numpy(init_score, np.float64, name='init_score')
            self.set_field('init_score', init_score)
            self.init_score = self.get_field('init_score')  # original values can be modified on the C++ side
        return self

    def set_group(self, group):
        """Set group size of Dataset (used for ranking).

        Parameters
        ----------
        group : list, numpy 1-D array, pandas Series or None
            Group size of each group.

        Returns
        -------
        self : Dataset
            Dataset with set group.
        """
        self.group = group
        if self.handle is not None and group is not None:
            group = list_to_1d_numpy(group, np.int32, name='group')
            self.set_field('group', group)
        return self
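
    # Illustrative sketch (editorial): for ranking tasks, `group` holds
    # consecutive query sizes (not query ids), and the sizes must sum to the
    # number of rows. Hypothetically:
    #
    #     # rows 0-9 form query 0, rows 10-14 query 1, rows 15-22 query 2
    #     ds = lgb.Dataset(X, label=relevance, group=[10, 5, 8])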

    def get_label(self):
        """Get the label of the Dataset.

        Returns
        -------
        label : numpy array or None
            The label information from the Dataset.
        """
        if self.label is None:
            self.label = self.get_field('label')
        return self.label

    def get_weight(self):
        """Get the weight of the Dataset.

        Returns
        -------
        weight : numpy array or None
            Weight for each data point from the Dataset.
        """
        if self.weight is None:
            self.weight = self.get_field('weight')
        return self.weight

    def get_init_score(self):
        """Get the initial score of the Dataset.

        Returns
        -------
        init_score : numpy array or None
            Init score of Booster.
        """
        if self.init_score is None:
            self.init_score = self.get_field('init_score')
        return self.init_score

    def get_data(self):
        """Get the raw data of the Dataset.

        Returns
        -------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of numpy arrays or None
            Raw data used in the Dataset construction.
        """
        if self.handle is None:
            raise Exception("Cannot get data before constructing the Dataset")
        if self.need_slice and self.used_indices is not None and self.reference is not None:
            self.data = self.reference.data
            if self.data is not None:
                if isinstance(self.data, np.ndarray) or scipy.sparse.issparse(self.data):
                    self.data = self.data[self.used_indices, :]
                elif isinstance(self.data, DataFrame):
                    self.data = self.data.iloc[self.used_indices].copy()
                elif isinstance(self.data, DataTable):
                    self.data = self.data[self.used_indices, :]
                else:
                    warnings.warn("Cannot subset {} type of raw data.\n"
                                  "Returning original raw data".format(type(self.data).__name__))
            self.need_slice = False
        if self.data is None:
            raise LightGBMError("Cannot call `get_data` after raw data has been freed; "
                                "set free_raw_data=False when constructing the Dataset to avoid this.")
        return self.data

    def get_group(self):
        """Get the group of the Dataset.

        Returns
        -------
        group : numpy array or None
            Group size of each group.
        """
        if self.group is None:
            self.group = self.get_field('group')
            if self.group is not None:
                # LightGBM returns group boundaries; convert them to per-group sizes
                self.group = np.diff(self.group)
        return self.group

    def num_data(self):
        """Get the number of rows in the Dataset.

        Returns
        -------
        number_of_rows : int
            The number of rows in the Dataset.
        """
        if self.handle is not None:
            ret = ctypes.c_int()
            _safe_call(_LIB.LGBM_DatasetGetNumData(self.handle,
                                                   ctypes.byref(ret)))
            return ret.value
        else:
            raise LightGBMError("Cannot get num_data before constructing the Dataset")

    def num_feature(self):
        """Get the number of columns (features) in the Dataset.

        Returns
        -------
        number_of_columns : int
            The number of columns (features) in the Dataset.
        """
        if self.handle is not None:
            ret = ctypes.c_int()
            _safe_call(_LIB.LGBM_DatasetGetNumFeature(self.handle,
                                                      ctypes.byref(ret)))
            return ret.value
        else:
            raise LightGBMError("Cannot get num_feature before constructing the Dataset")

    def get_ref_chain(self, ref_limit=100):
        """Get a chain of Dataset objects.

        Starts with self, then goes to self.reference (if it exists),
        then to self.reference.reference, etc.
        until we hit ``ref_limit`` or a reference loop.

        Parameters
        ----------
        ref_limit : int, optional (default=100)
            The limit number of references.

        Returns
        -------
        ref_chain : set of Dataset
            Chain of references of the Datasets.
        """
        head = self
        ref_chain = set()
        while len(ref_chain) < ref_limit:
            if isinstance(head, Dataset):
                ref_chain.add(head)
                if (head.reference is not None) and (head.reference not in ref_chain):
                    head = head.reference
                else:
                    break
            else:
                break
        return ref_chain

    def add_features_from(self, other):
        """Add features from other Dataset to the current Dataset.

        Both Datasets must be constructed before calling this method.

        Parameters
        ----------
        other : Dataset
            The Dataset to take features from.

        Returns
        -------
        self : Dataset
            Dataset with the new features added.
        """
        if self.handle is None or other.handle is None:
            raise ValueError('Both source and target Datasets must be constructed before adding features')
        _safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self.handle, other.handle))
        return self
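
    # Illustrative sketch (editorial): both Datasets must already be
    # constructed, and the merge happens inside the C++ library, so the
    # columns of `other` are appended without re-binning. Hypothetical usage:
    #
    #     ds_a = lgb.Dataset(X_a, label=y, free_raw_data=False).construct()
    #     ds_b = lgb.Dataset(X_b, free_raw_data=False).construct()
    #     ds_a.add_features_from(ds_b)   # ds_a now holds columns of X_a and X_b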

    def _dump_text(self, filename):
        """Save Dataset to a text file.

        This format cannot be loaded back in by LightGBM, but is useful for debugging purposes.

        Parameters
        ----------
        filename : string
            Name of the output file.

        Returns
        -------
        self : Dataset
            Returns self.
        """
        _safe_call(_LIB.LGBM_DatasetDumpText(
            self.construct().handle,
            c_str(filename)))
        return self


class Booster(object):
    """Booster in LightGBM."""

    def __init__(self, params=None, train_set=None, model_file=None, model_str=None, silent=False):
        """Initialize the Booster.

        Parameters
        ----------
        params : dict or None, optional (default=None)
            Parameters for Booster.
        train_set : Dataset or None, optional (default=None)
            Training dataset.
        model_file : string or None, optional (default=None)
            Path to the model file.
        model_str : string or None, optional (default=None)
            Model will be loaded from this string.
        silent : bool, optional (default=False)
            Whether to print messages during construction.
        """
        self.handle = None
        self.network = False
        self.__need_reload_eval_info = True
        self._train_data_name = "training"
        self.__attr = {}
        self.__set_objective_to_none = False
        self.best_iteration = -1
        self.best_score = {}
        params = {} if params is None else copy.deepcopy(params)
        # the user can set verbosity via params; it has higher priority than the silent argument
        if not any(verbose_alias in params for verbose_alias in _ConfigAliases.get("verbosity")) and silent:
            params["verbose"] = -1
        if train_set is not None:
            # Training task
            if not isinstance(train_set, Dataset):
                raise TypeError('Training data should be Dataset instance, met {}'
                                .format(type(train_set).__name__))
            # set network if necessary
            for alias in _ConfigAliases.get("machines"):
                if alias in params:
                    machines = params[alias]
                    if isinstance(machines, string_type):
                        num_machines = len(machines.split(','))
                    elif isinstance(machines, (list, set)):
                        num_machines = len(machines)
                        machines = ','.join(machines)
                    else:
                        raise ValueError("Invalid machines in params.")
                    self.set_network(machines,
                                     local_listen_port=params.get("local_listen_port", 12400),
                                     listen_time_out=params.get("listen_time_out", 120),
                                     num_machines=params.setdefault("num_machines", num_machines))
                    break
            # construct booster object
            train_set.construct()
            # copy the parameters from train_set
            params.update(train_set.get_params())
            params_str = param_dict_to_str(params)
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_BoosterCreate(
                train_set.handle,
                c_str(params_str),
                ctypes.byref(self.handle)))
            # save reference to data
            self.train_set = train_set
            self.valid_sets = []
            self.name_valid_sets = []
            self.__num_dataset = 1
            self.__init_predictor = train_set._predictor
            if self.__init_predictor is not None:
                _safe_call(_LIB.LGBM_BoosterMerge(
                    self.handle,
                    self.__init_predictor.handle))
            out_num_class = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.__num_class = out_num_class.value
            # buffer for inner predict
            self.__inner_predict_buffer = [None]
            self.__is_predicted_cur_iter = [False]
            self.__get_eval_info()
            self.pandas_categorical = train_set.pandas_categorical
            self.train_set_version = train_set.version
        elif model_file is not None:
            # Prediction task
            out_num_iterations = ctypes.c_int(0)
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(out_num_iterations),
                ctypes.byref(self.handle)))
            out_num_class = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.__num_class = out_num_class.value
            self.pandas_categorical = _load_pandas_categorical(file_name=model_file)
        elif model_str is not None:
            self.model_from_string(model_str, not silent)
        else:
            raise TypeError('Need at least one training dataset or model file or model string '
                            'to create Booster instance')
        self.params = params
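
    # Illustrative sketch (editorial): exactly one of train_set, model_file
    # or model_str selects a construction path above. Hypothetical usage:
    #
    #     bst = lgb.Booster(params={'objective': 'binary'}, train_set=train_ds)
    #     bst = lgb.Booster(model_file='model.txt')   # prediction-only Booster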

    def __del__(self):
        try:
            if self.network:
                self.free_network()
        except AttributeError:
            pass
        try:
            if self.handle is not None:
                _safe_call(_LIB.LGBM_BoosterFree(self.handle))
        except AttributeError:
            pass

    def __copy__(self):
        return self.__deepcopy__(None)

    def __deepcopy__(self, _):
        model_str = self.model_to_string(num_iteration=-1)
        booster = Booster(model_str=model_str)
        return booster

    def __getstate__(self):
        this = self.__dict__.copy()
        handle = this['handle']
        this.pop('train_set', None)
        this.pop('valid_sets', None)
        if handle is not None:
            this["handle"] = self.model_to_string(num_iteration=-1)
        return this

    def __setstate__(self, state):
        model_str = state.get('handle', None)
        if model_str is not None:
            handle = ctypes.c_void_p()
            out_num_iterations = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterLoadModelFromString(
                c_str(model_str),
                ctypes.byref(out_num_iterations),
                ctypes.byref(handle)))
            state['handle'] = handle
        self.__dict__.update(state)

    def free_dataset(self):
        """Free Booster's Datasets.

        Returns
        -------
        self : Booster
            Booster without Datasets.
        """
        self.__dict__.pop('train_set', None)
        self.__dict__.pop('valid_sets', None)
        self.__num_dataset = 0
        return self

    def _free_buffer(self):
        self.__inner_predict_buffer = []
        self.__is_predicted_cur_iter = []
        return self

    def set_network(self, machines, local_listen_port=12400,
                    listen_time_out=120, num_machines=1):
        """Set the network configuration.

        Parameters
        ----------
        machines : list, set or string
            Names of machines.
        local_listen_port : int, optional (default=12400)
            TCP listen port for local machines.
        listen_time_out : int, optional (default=120)
            Socket time-out in minutes.
        num_machines : int, optional (default=1)
            The number of machines for parallel learning application.

        Returns
        -------
        self : Booster
            Booster with set network.
        """
        _safe_call(_LIB.LGBM_NetworkInit(c_str(machines),
                                         ctypes.c_int(local_listen_port),
                                         ctypes.c_int(listen_time_out),
                                         ctypes.c_int(num_machines)))
        self.network = True
        return self

    def free_network(self):
        """Free Booster's network.

        Returns
        -------
        self : Booster
            Booster with freed network.
        """
        _safe_call(_LIB.LGBM_NetworkFree())
        self.network = False
        return self

    def trees_to_dataframe(self):
        """Parse the fitted model and return in an easy-to-read pandas DataFrame.

        Returns
        -------
        result : pandas DataFrame
            Returns a pandas DataFrame of the parsed model.
        """
        if not PANDAS_INSTALLED:
            raise LightGBMError('This method cannot be run without pandas installed')

        if self.num_trees() == 0:
            raise LightGBMError('There are no trees in this Booster and thus nothing to parse')

        def _is_split_node(tree):
            return 'split_index' in tree.keys()

        def create_node_record(tree, node_depth=1, tree_index=None,
                               feature_names=None, parent_node=None):

            def _get_node_index(tree, tree_index):
                tree_num = str(tree_index) + '-' if tree_index is not None else ''
                is_split = _is_split_node(tree)
                node_type = 'S' if is_split else 'L'
                # if a single node tree it won't have `leaf_index` so return 0
                node_num = str(tree.get('split_index' if is_split else 'leaf_index', 0))
                return tree_num + node_type + node_num

            def _get_split_feature(tree, feature_names):
                if _is_split_node(tree):
                    if feature_names is not None:
                        feature_name = feature_names[tree['split_feature']]
                    else:
                        feature_name = tree['split_feature']
                else:
                    feature_name = None
                return feature_name

            def _is_single_node_tree(tree):
                return set(tree.keys()) == {'leaf_value'}

            # Create the node record, and populate universal data members
            node = OrderedDict()
            node['tree_index'] = tree_index
            node['node_depth'] = node_depth
            node['node_index'] = _get_node_index(tree, tree_index)
            node['left_child'] = None
            node['right_child'] = None
            node['parent_index'] = parent_node
            node['split_feature'] = _get_split_feature(tree, feature_names)
            node['split_gain'] = None
            node['threshold'] = None
            node['decision_type'] = None
            node['missing_direction'] = None
            node['missing_type'] = None
            node['value'] = None
            node['weight'] = None
            node['count'] = None

            # Update values to reflect node type (leaf or split)
            if _is_split_node(tree):
                node['left_child'] = _get_node_index(tree['left_child'], tree_index)
                node['right_child'] = _get_node_index(tree['right_child'], tree_index)
                node['split_gain'] = tree['split_gain']
                node['threshold'] = tree['threshold']
                node['decision_type'] = tree['decision_type']
                node['missing_direction'] = 'left' if tree['default_left'] else 'right'
                node['missing_type'] = tree['missing_type']
                node['value'] = tree['internal_value']
                node['weight'] = tree['internal_weight']
                node['count'] = tree['internal_count']
            else:
                node['value'] = tree['leaf_value']
                if not _is_single_node_tree(tree):
                    node['weight'] = tree['leaf_weight']
                    node['count'] = tree['leaf_count']

            return node

        def tree_dict_to_node_list(tree, node_depth=1, tree_index=None,
                                   feature_names=None, parent_node=None):

            node = create_node_record(tree,
                                      node_depth=node_depth,
                                      tree_index=tree_index,
                                      feature_names=feature_names,
                                      parent_node=parent_node)

            res = [node]

            if _is_split_node(tree):
                # traverse the next level of the tree
                children = ['left_child', 'right_child']
                for child in children:
                    subtree_list = tree_dict_to_node_list(
                        tree[child],
                        node_depth=node_depth + 1,
                        tree_index=tree_index,
                        feature_names=feature_names,
                        parent_node=node['node_index'])
                    # "subtree_list" is a list of node records (dicts) for the child subtree;
                    # append them all to the result.
                    res.extend(subtree_list)
            return res

        model_dict = self.dump_model()
        feature_names = model_dict['feature_names']
        model_list = []
        for tree in model_dict['tree_info']:
            model_list.extend(tree_dict_to_node_list(tree['tree_structure'],
                                                     tree_index=tree['tree_index'],
                                                     feature_names=feature_names))

        return DataFrame(model_list, columns=model_list[0].keys())
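
    # Illustrative sketch (editorial): each row of the returned frame is one
    # node (split or leaf), with parent/child links expressed through
    # 'node_index'. Hypothetical usage:
    #
    #     df = bst.trees_to_dataframe()
    #     first_tree = df[df['tree_index'] == 0]      # inspect the first tree
    #     df.groupby('split_feature').size()          # how often each feature splits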

    def set_train_data_name(self, name):
        """Set the name to the training Dataset.

        Parameters
        ----------
        name : string
            Name for the training Dataset.

        Returns
        -------
        self : Booster
            Booster with set training Dataset name.
        """
        self._train_data_name = name
        return self

    def add_valid(self, data, name):
        """Add validation data.

        Parameters
        ----------
        data : Dataset
            Validation data.
        name : string
            Name of validation data.

        Returns
        -------
        self : Booster
            Booster with set validation data.
        """
        if not isinstance(data, Dataset):
            raise TypeError('Validation data should be Dataset instance, met {}'
                            .format(type(data).__name__))
        if data._predictor is not self.__init_predictor:
            raise LightGBMError("Add validation data failed, "
                                "you should use the same predictor for these data")
        _safe_call(_LIB.LGBM_BoosterAddValidData(
            self.handle,
            data.construct().handle))
        self.valid_sets.append(data)
        self.name_valid_sets.append(name)
        self.__num_dataset += 1
        self.__inner_predict_buffer.append(None)
        self.__is_predicted_cur_iter.append(False)
        return self

    def reset_parameter(self, params):
        """Reset parameters of Booster.

        Parameters
        ----------
        params : dict
            New parameters for Booster.

        Returns
        -------
        self : Booster
            Booster with new parameters.
        """
        params_str = param_dict_to_str(params)
        if params_str:
            _safe_call(_LIB.LGBM_BoosterResetParameter(
                self.handle,
                c_str(params_str)))
        self.params.update(params)
        return self

    def update(self, train_set=None, fobj=None):
        """Update Booster for one iteration.

        Parameters
        ----------
        train_set : Dataset or None, optional (default=None)
            Training data.
            If None, last training data is used.
        fobj : callable or None, optional (default=None)
            Customized objective function.
            Should accept two parameters: preds, train_data,
            and return (grad, hess).

                preds : list or numpy 1-D array
                    The predicted values.
                train_data : Dataset
                    The training dataset.
                grad : list or numpy 1-D array
                    The value of the first order derivative (gradient) for each sample point.
                hess : list or numpy 1-D array
                    The value of the second order derivative (Hessian) for each sample point.

            For multi-class task, preds are grouped by class_id first, then by row_id.
            If you want to get the i-th row preds in the j-th class, the access way is score[j * num_data + i]
            and you should group grad and hess in this way as well.

        Returns
        -------
        is_finished : bool
            Whether the update was successfully finished.
        """
        # reset the training data if it has changed
        if train_set is None and self.train_set_version != self.train_set.version:
            train_set = self.train_set
            is_the_same_train_set = False
        else:
            is_the_same_train_set = train_set is self.train_set and self.train_set_version == train_set.version
        if train_set is not None and not is_the_same_train_set:
Guolin Ke's avatar
Guolin Ke committed
2133
            if not isinstance(train_set, Dataset):
2134
2135
                raise TypeError('Training data should be Dataset instance, met {}'
                                .format(type(train_set).__name__))
Guolin Ke's avatar
Guolin Ke committed
2136
            if train_set._predictor is not self.__init_predictor:
2137
2138
                raise LightGBMError("Replace training data failed, "
                                    "you should use same predictor for these data")
wxchan's avatar
wxchan committed
2139
2140
2141
            self.train_set = train_set
            _safe_call(_LIB.LGBM_BoosterResetTrainingData(
                self.handle,
wxchan's avatar
wxchan committed
2142
                self.train_set.construct().handle))
wxchan's avatar
wxchan committed
2143
            self.__inner_predict_buffer[0] = None
2144
            self.train_set_version = self.train_set.version
wxchan's avatar
wxchan committed
2145
2146
        is_finished = ctypes.c_int(0)
        if fobj is None:
2147
            if self.__set_objective_to_none:
2148
                raise LightGBMError('Cannot update due to null objective function.')
wxchan's avatar
wxchan committed
2149
2150
2151
            _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
                self.handle,
                ctypes.byref(is_finished)))
wxchan's avatar
wxchan committed
2152
            self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
wxchan's avatar
wxchan committed
2153
2154
            return is_finished.value == 1
        else:
2155
            if not self.__set_objective_to_none:
Nikita Titov's avatar
Nikita Titov committed
2156
                self.reset_parameter({"objective": "none"}).__set_objective_to_none = True
wxchan's avatar
wxchan committed
2157
2158
2159
2160
            grad, hess = fobj(self.__inner_predict(0), self.train_set)
            return self.__boost(grad, hess)

    def __boost(self, grad, hess):
        """Boost Booster for one iteration with customized gradient statistics.

        .. note::

            For multi-class task, the score is grouped by class_id first, then by row_id.
            If you want to get the i-th row score in the j-th class, the access way is
            score[j * num_data + i], and you should group grad and hess in this way as well.

        Parameters
        ----------
        grad : list or numpy 1-D array
            The first order derivative (gradient).
        hess : list or numpy 1-D array
            The second order derivative (Hessian).

        Returns
        -------
        is_finished : bool
            Whether the boost was successfully finished.
        """
        grad = list_to_1d_numpy(grad, name='gradient')
        hess = list_to_1d_numpy(hess, name='hessian')
        assert grad.flags.c_contiguous
        assert hess.flags.c_contiguous
        if len(grad) != len(hess):
            raise ValueError("Lengths of gradient({}) and hessian({}) don't match"
                             .format(len(grad), len(hess)))
        is_finished = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
            self.handle,
            grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            ctypes.byref(is_finished)))
        self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
        return is_finished.value == 1

    def rollback_one_iter(self):
        """Rollback one iteration.

        Returns
        -------
        self : Booster
            Booster with rolled back one iteration.
        """
        _safe_call(_LIB.LGBM_BoosterRollbackOneIter(
            self.handle))
        self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
        return self

    def current_iteration(self):
        """Get the index of the current iteration.

        Returns
        -------
        cur_iter : int
            The index of the current iteration.
        """
        out_cur_iter = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
            self.handle,
            ctypes.byref(out_cur_iter)))
        return out_cur_iter.value

    def num_model_per_iteration(self):
        """Get number of models per iteration.

        Returns
        -------
        model_per_iter : int
            The number of models per iteration.
        """
        model_per_iter = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterNumModelPerIteration(
            self.handle,
            ctypes.byref(model_per_iter)))
        return model_per_iter.value

    def num_trees(self):
        """Get number of weak sub-models.

        Returns
        -------
        num_trees : int
            The number of weak sub-models.
        """
        num_trees = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterNumberOfTotalModel(
            self.handle,
            ctypes.byref(num_trees)))
        return num_trees.value

    def upper_bound(self):
        """Get upper bound value of a model.

        Returns
        -------
        upper_bound : double
            Upper bound value of the model.
        """
        ret = ctypes.c_double(0)
        _safe_call(_LIB.LGBM_BoosterGetUpperBoundValue(
            self.handle,
            ctypes.byref(ret)))
        return ret.value

    def lower_bound(self):
        """Get lower bound value of a model.

        Returns
        -------
        lower_bound : double
            Lower bound value of the model.
        """
        ret = ctypes.c_double(0)
        _safe_call(_LIB.LGBM_BoosterGetLowerBoundValue(
            self.handle,
            ctypes.byref(ret)))
        return ret.value

    def eval(self, data, name, feval=None):
        """Evaluate for data.

        Parameters
        ----------
        data : Dataset
            Data for evaluating.
        name : string
            Name of the data.
        feval : callable or None, optional (default=None)
            Customized evaluation function.
            Should accept two parameters: preds, eval_data,
            and return (eval_name, eval_result, is_higher_better) or list of such tuples.

                preds : list or numpy 1-D array
                    The predicted values.
                eval_data : Dataset
                    The evaluation dataset.
                eval_name : string
                    The name of evaluation function (without whitespaces).
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.

            For multi-class task, preds are grouped by class_id first, then by row_id.
            If you want to get the i-th row preds in the j-th class, the access way is
            preds[j * num_data + i].

        Returns
        -------
        result : list
            List with evaluation results.
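
        Examples
        --------
        A minimal sketch with a custom metric; the random data and the
        ``mean_error`` helper are illustrative assumptions:

        >>> import numpy as np
        >>> import lightgbm as lgb
        >>> X, y = np.random.rand(100, 2), np.random.rand(100)
        >>> train_data = lgb.Dataset(X, label=y)
        >>> bst = lgb.Booster({'objective': 'regression', 'verbose': -1}, train_data)
        >>> is_finished = bst.update()
        >>> def mean_error(preds, eval_data):
        ...     return 'mean_error', np.mean(np.abs(preds - eval_data.get_label())), False
        >>> results = bst.eval(train_data, 'train', feval=mean_error)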
        """
        if not isinstance(data, Dataset):
            raise TypeError("Can only eval for Dataset instance")
        data_idx = -1
        if data is self.train_set:
            data_idx = 0
        else:
            for i in range_(len(self.valid_sets)):
                if data is self.valid_sets[i]:
                    data_idx = i + 1
                    break
        # need to push new valid data
        if data_idx == -1:
            self.add_valid(data, name)
            data_idx = self.__num_dataset - 1

        return self.__inner_eval(name, data_idx, feval)

    def eval_train(self, feval=None):
        """Evaluate for training data.

        Parameters
        ----------
        feval : callable or None, optional (default=None)
            Customized evaluation function.
            Should accept two parameters: preds, train_data,
            and return (eval_name, eval_result, is_higher_better) or list of such tuples.

                preds : list or numpy 1-D array
                    The predicted values.
                train_data : Dataset
                    The training dataset.
                eval_name : string
                    The name of evaluation function (without whitespaces).
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.

            For multi-class task, preds are grouped by class_id first, then by row_id.
            If you want to get the i-th row preds in the j-th class, the access way is
            preds[j * num_data + i].

        Returns
        -------
        result : list
            List with evaluation results.
        """
        return self.__inner_eval(self._train_data_name, 0, feval)

    def eval_valid(self, feval=None):
        """Evaluate for validation data.

        Parameters
        ----------
        feval : callable or None, optional (default=None)
            Customized evaluation function.
            Should accept two parameters: preds, valid_data,
            and return (eval_name, eval_result, is_higher_better) or list of such tuples.

                preds : list or numpy 1-D array
                    The predicted values.
                valid_data : Dataset
                    The validation dataset.
                eval_name : string
                    The name of evaluation function (without whitespaces).
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Whether a higher eval result is better, e.g. AUC is ``is_higher_better``.

            For multi-class task, preds are grouped by class_id first, then by row_id.
            If you want to get the i-th row preds in the j-th class, the access way is
            preds[j * num_data + i].

        Returns
        -------
        result : list
            List with evaluation results.
        """
        return [item for i in range_(1, self.__num_dataset)
                for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)]

    def save_model(self, filename, num_iteration=None, start_iteration=0):
        """Save Booster to file.

        Parameters
        ----------
        filename : string
            Filename to save Booster.
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.

        Returns
        -------
        self : Booster
            Returns self.
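
        Examples
        --------
        A minimal sketch (the random data and the file name are illustrative
        assumptions):

        >>> import numpy as np
        >>> import lightgbm as lgb
        >>> train_data = lgb.Dataset(np.random.rand(100, 2), label=np.random.rand(100))
        >>> bst = lgb.Booster({'objective': 'regression', 'verbose': -1}, train_data)
        >>> is_finished = bst.update()
        >>> bst = bst.save_model('model.txt')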
        """
        if num_iteration is None:
            num_iteration = self.best_iteration
        _safe_call(_LIB.LGBM_BoosterSaveModel(
            self.handle,
            ctypes.c_int(start_iteration),
            ctypes.c_int(num_iteration),
            c_str(filename)))
        _dump_pandas_categorical(self.pandas_categorical, filename)
        return self

    def shuffle_models(self, start_iteration=0, end_iteration=-1):
        """Shuffle models.

        Parameters
        ----------
        start_iteration : int, optional (default=0)
            The first iteration that will be shuffled.
        end_iteration : int, optional (default=-1)
            The last iteration that will be shuffled.
            If <= 0, the last available iteration is used.

        Returns
        -------
        self : Booster
            Booster with shuffled models.
        """
        _safe_call(_LIB.LGBM_BoosterShuffleModels(
            self.handle,
            ctypes.c_int(start_iteration),
            ctypes.c_int(end_iteration)))
        return self

    def model_from_string(self, model_str, verbose=True):
        """Load Booster from a string.

        Parameters
        ----------
        model_str : string
            Model will be loaded from this string.
        verbose : bool, optional (default=True)
            Whether to print messages while loading model.

        Returns
        -------
        self : Booster
            Loaded Booster object.
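
        Examples
        --------
        A minimal sketch of a save/load round trip through strings; the
        random data is an illustrative assumption:

        >>> import numpy as np
        >>> import lightgbm as lgb
        >>> train_data = lgb.Dataset(np.random.rand(100, 2), label=np.random.rand(100))
        >>> bst = lgb.Booster({'objective': 'regression', 'verbose': -1}, train_data)
        >>> is_finished = bst.update()
        >>> model_str = bst.model_to_string()
        >>> bst2 = lgb.Booster({'objective': 'regression', 'verbose': -1}, train_data)
        >>> bst2 = bst2.model_from_string(model_str, verbose=False)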
        """
        if self.handle is not None:
            _safe_call(_LIB.LGBM_BoosterFree(self.handle))
        self._free_buffer()
        self.handle = ctypes.c_void_p()
        out_num_iterations = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterLoadModelFromString(
            c_str(model_str),
            ctypes.byref(out_num_iterations),
            ctypes.byref(self.handle)))
        out_num_class = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterGetNumClasses(
            self.handle,
            ctypes.byref(out_num_class)))
        if verbose:
            print('Finished loading model, total %d iterations used' % int(out_num_iterations.value))
        self.__num_class = out_num_class.value
        self.pandas_categorical = _load_pandas_categorical(model_str=model_str)
        return self

    def model_to_string(self, num_iteration=None, start_iteration=0):
        """Save Booster to string.

        Parameters
        ----------
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.

        Returns
        -------
        str_repr : string
            String representation of Booster.
        """
        if num_iteration is None:
            num_iteration = self.best_iteration
        buffer_len = 1 << 20
        tmp_out_len = ctypes.c_int64(0)
        string_buffer = ctypes.create_string_buffer(buffer_len)
        ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
        _safe_call(_LIB.LGBM_BoosterSaveModelToString(
            self.handle,
            ctypes.c_int(start_iteration),
            ctypes.c_int(num_iteration),
            ctypes.c_int64(buffer_len),
            ctypes.byref(tmp_out_len),
            ptr_string_buffer))
        actual_len = tmp_out_len.value
        # if buffer length is not long enough, re-allocate a buffer
        if actual_len > buffer_len:
            string_buffer = ctypes.create_string_buffer(actual_len)
            ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
            _safe_call(_LIB.LGBM_BoosterSaveModelToString(
                self.handle,
                ctypes.c_int(start_iteration),
                ctypes.c_int(num_iteration),
                ctypes.c_int64(actual_len),
                ctypes.byref(tmp_out_len),
                ptr_string_buffer))
        ret = string_buffer.value.decode()
        ret += _dump_pandas_categorical(self.pandas_categorical)
        return ret

    def dump_model(self, num_iteration=None, start_iteration=0):
        """Dump Booster to JSON format.

        Parameters
        ----------
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be dumped.
            If None, if the best iteration exists, it is dumped; otherwise, all iterations are dumped.
            If <= 0, all iterations are dumped.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be dumped.

        Returns
        -------
        json_repr : dict
            JSON format of Booster.
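
        Examples
        --------
        A minimal sketch (the random data is an illustrative assumption):

        >>> import numpy as np
        >>> import lightgbm as lgb
        >>> train_data = lgb.Dataset(np.random.rand(100, 2), label=np.random.rand(100))
        >>> bst = lgb.Booster({'objective': 'regression', 'verbose': -1}, train_data)
        >>> is_finished = bst.update()
        >>> model_json = bst.dump_model()
        >>> 'tree_info' in model_json
        True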
        """
        if num_iteration is None:
            num_iteration = self.best_iteration
        buffer_len = 1 << 20
        tmp_out_len = ctypes.c_int64(0)
        string_buffer = ctypes.create_string_buffer(buffer_len)
        ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
        _safe_call(_LIB.LGBM_BoosterDumpModel(
            self.handle,
            ctypes.c_int(start_iteration),
            ctypes.c_int(num_iteration),
            ctypes.c_int64(buffer_len),
            ctypes.byref(tmp_out_len),
            ptr_string_buffer))
        actual_len = tmp_out_len.value
        # if buffer length is not long enough, reallocate a buffer
        if actual_len > buffer_len:
            string_buffer = ctypes.create_string_buffer(actual_len)
            ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
            _safe_call(_LIB.LGBM_BoosterDumpModel(
                self.handle,
                ctypes.c_int(start_iteration),
                ctypes.c_int(num_iteration),
                ctypes.c_int64(actual_len),
                ctypes.byref(tmp_out_len),
                ptr_string_buffer))
        ret = json.loads(string_buffer.value.decode())
        ret['pandas_categorical'] = json.loads(json.dumps(self.pandas_categorical,
                                                          default=json_default_with_numpy))
        return ret

    def predict(self, data, num_iteration=None,
                raw_score=False, pred_leaf=False, pred_contrib=False,
                data_has_header=False, is_reshape=True, **kwargs):
        """Make a prediction.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
            Data source for prediction.
            If string, it represents the path to txt file.
        num_iteration : int or None, optional (default=None)
            Limit number of iterations in the prediction.
            If None, if the best iteration exists, it is used; otherwise, all iterations are used.
            If <= 0, all iterations are used (no limits).
        raw_score : bool, optional (default=False)
            Whether to predict raw scores.
        pred_leaf : bool, optional (default=False)
            Whether to predict leaf index.
        pred_contrib : bool, optional (default=False)
            Whether to predict feature contributions.

            .. note::

                If you want to get more explanations for your model's predictions using SHAP values,
                like SHAP interaction values,
                you can install the shap package (https://github.com/slundberg/shap).
                Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
                column, where the last column is the expected value.

        data_has_header : bool, optional (default=False)
            Whether the data has header.
            Used only if data is string.
        is_reshape : bool, optional (default=True)
            If True, result is reshaped to [nrow, ncol].
        **kwargs
            Other parameters for the prediction.

        Returns
        -------
        result : numpy array
            Prediction result.
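
        Examples
        --------
        A minimal sketch (the random data is an illustrative assumption):

        >>> import numpy as np
        >>> import lightgbm as lgb
        >>> X, y = np.random.rand(100, 2), np.random.rand(100)
        >>> bst = lgb.Booster({'objective': 'regression', 'verbose': -1}, lgb.Dataset(X, label=y))
        >>> is_finished = bst.update()
        >>> preds = bst.predict(X)
        >>> preds.shape
        (100,)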
        """
        predictor = self._to_predictor(copy.deepcopy(kwargs))
        if num_iteration is None:
            num_iteration = self.best_iteration
        return predictor.predict(data, num_iteration,
                                 raw_score, pred_leaf, pred_contrib,
                                 data_has_header, is_reshape)

    def refit(self, data, label, decay_rate=0.9, **kwargs):
        """Refit the existing Booster by new data.

        Parameters
        ----------
        data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
            Data source for refit.
            If string, it represents the path to txt file.
        label : list, numpy 1-D array or pandas Series / one-column DataFrame
            Label for refit.
        decay_rate : float, optional (default=0.9)
            Decay rate of refit,
            will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees.
        **kwargs
            Other parameters for refit.
            These parameters will be passed to ``predict`` method.

        Returns
        -------
        result : Booster
            Refitted Booster.
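
        Examples
        --------
        A minimal sketch (the random data and the decay rate are illustrative
        assumptions):

        >>> import numpy as np
        >>> import lightgbm as lgb
        >>> X, y = np.random.rand(100, 2), np.random.rand(100)
        >>> bst = lgb.Booster({'objective': 'regression', 'verbose': -1}, lgb.Dataset(X, label=y))
        >>> is_finished = bst.update()
        >>> X_new, y_new = np.random.rand(50, 2), np.random.rand(50)
        >>> refitted = bst.refit(X_new, y_new, decay_rate=0.8)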
        """
        if self.__set_objective_to_none:
            raise LightGBMError('Cannot refit due to null objective function.')
        predictor = self._to_predictor(copy.deepcopy(kwargs))
        leaf_preds = predictor.predict(data, -1, pred_leaf=True)
        nrow, ncol = leaf_preds.shape
        train_set = Dataset(data, label, silent=True)
        new_params = copy.deepcopy(self.params)
        new_params['refit_decay_rate'] = decay_rate
        new_booster = Booster(new_params, train_set)
        # Copy models
        _safe_call(_LIB.LGBM_BoosterMerge(
            new_booster.handle,
            predictor.handle))
        leaf_preds = leaf_preds.reshape(-1)
        ptr_data, type_ptr_data, _ = c_int_array(leaf_preds)
        _safe_call(_LIB.LGBM_BoosterRefit(
            new_booster.handle,
            ptr_data,
            ctypes.c_int(nrow),
            ctypes.c_int(ncol)))
        new_booster.network = self.network
        new_booster.__attr = self.__attr.copy()
        return new_booster

    def get_leaf_output(self, tree_id, leaf_id):
        """Get the output of a leaf.

        Parameters
        ----------
        tree_id : int
            The index of the tree.
        leaf_id : int
            The index of the leaf in the tree.

        Returns
        -------
        result : float
            The output of the leaf.
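
        Examples
        --------
        A minimal sketch (the random data and the tree/leaf indices are
        illustrative assumptions):

        >>> import numpy as np
        >>> import lightgbm as lgb
        >>> train_data = lgb.Dataset(np.random.rand(100, 2), label=np.random.rand(100))
        >>> bst = lgb.Booster({'objective': 'regression', 'verbose': -1}, train_data)
        >>> is_finished = bst.update()
        >>> leaf_value = bst.get_leaf_output(tree_id=0, leaf_id=0)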
        """
        ret = ctypes.c_double(0)
        _safe_call(_LIB.LGBM_BoosterGetLeafValue(
            self.handle,
            ctypes.c_int(tree_id),
            ctypes.c_int(leaf_id),
            ctypes.byref(ret)))
        return ret.value

    def _to_predictor(self, pred_parameter=None):
        """Convert to predictor."""
        predictor = _InnerPredictor(booster_handle=self.handle, pred_parameter=pred_parameter)
        predictor.pandas_categorical = self.pandas_categorical
        return predictor

    def num_feature(self):
        """Get number of features.

        Returns
        -------
        num_feature : int
            The number of features.
        """
        out_num_feature = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterGetNumFeature(
            self.handle,
            ctypes.byref(out_num_feature)))
        return out_num_feature.value

    def feature_name(self):
        """Get names of features.

        Returns
        -------
        result : list
            List with names of features.
        """
        num_feature = self.num_feature()
        # Get name of features
        tmp_out_len = ctypes.c_int(0)
        string_buffers = [ctypes.create_string_buffer(255) for i in range_(num_feature)]
        ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers))
        _safe_call(_LIB.LGBM_BoosterGetFeatureNames(
            self.handle,
            ctypes.byref(tmp_out_len),
            ptr_string_buffers))
        if num_feature != tmp_out_len.value:
            raise ValueError("Length of feature names doesn't match the number of features")
        return [string_buffers[i].value.decode() for i in range_(num_feature)]

    def feature_importance(self, importance_type='split', iteration=None):
        """Get feature importances.

        Parameters
        ----------
        importance_type : string, optional (default="split")
            How the importance is calculated.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.
        iteration : int or None, optional (default=None)
            Limit number of iterations in the feature importance calculation.
            If None, if the best iteration exists, it is used; otherwise, all trees are used.
            If <= 0, all trees are used (no limits).

        Returns
        -------
        result : numpy array
            Array with feature importances.
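
        Examples
        --------
        A minimal sketch (the random data is an illustrative assumption):

        >>> import numpy as np
        >>> import lightgbm as lgb
        >>> train_data = lgb.Dataset(np.random.rand(100, 2), label=np.random.rand(100))
        >>> bst = lgb.Booster({'objective': 'regression', 'verbose': -1}, train_data)
        >>> is_finished = bst.update()
        >>> bst.feature_importance(importance_type='split').shape
        (2,)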
        """
        if iteration is None:
            iteration = self.best_iteration
        if importance_type == "split":
            importance_type_int = 0
        elif importance_type == "gain":
            importance_type_int = 1
        else:
            importance_type_int = -1
        result = np.zeros(self.num_feature(), dtype=np.float64)
        _safe_call(_LIB.LGBM_BoosterFeatureImportance(
            self.handle,
            ctypes.c_int(iteration),
            ctypes.c_int(importance_type_int),
            result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
        if importance_type_int == 0:
            return result.astype(np.int32)
        else:
            return result

    def get_split_value_histogram(self, feature, bins=None, xgboost_style=False):
        """Get split value histogram for the specified feature.

        Parameters
        ----------
        feature : int or string
            The feature name or index the histogram is calculated for.
            If int, interpreted as index.
            If string, interpreted as name.

            .. warning::

                Categorical features are not supported.

        bins : int, string or None, optional (default=None)
            The maximum number of bins.
            If None, or int and > number of unique split values and ``xgboost_style=True``,
            the number of bins equals number of unique split values.
            If string, it should be one from the list of the supported values by ``numpy.histogram()`` function.
        xgboost_style : bool, optional (default=False)
            Whether the returned result should be in the same form as it is in XGBoost.
            If False, the returned value is tuple of 2 numpy arrays as it is in ``numpy.histogram()`` function.
            If True, the returned value is matrix, in which the first column is the right edges of non-empty bins
            and the second one is the histogram values.

        Returns
        -------
        result_tuple : tuple of 2 numpy arrays
            If ``xgboost_style=False``, the values of the histogram of used splitting values for the specified feature
            and the bin edges.
        result_array_like : numpy array or pandas DataFrame (if pandas is installed)
            If ``xgboost_style=True``, the histogram of used splitting values for the specified feature.
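
        Examples
        --------
        A minimal sketch (the random data and the feature index are
        illustrative assumptions):

        >>> import numpy as np
        >>> import lightgbm as lgb
        >>> train_data = lgb.Dataset(np.random.rand(100, 2), label=np.random.rand(100))
        >>> bst = lgb.Booster({'objective': 'regression', 'verbose': -1}, train_data)
        >>> is_finished = bst.update()
        >>> hist, bin_edges = bst.get_split_value_histogram(feature=0)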
        """
        def add(root):
            """Recursively add thresholds."""
            if 'split_index' in root:  # non-leaf
                if feature_names is not None and isinstance(feature, string_type):
                    split_feature = feature_names[root['split_feature']]
                else:
                    split_feature = root['split_feature']
                if split_feature == feature:
                    if isinstance(root['threshold'], string_type):
                        raise LightGBMError('Cannot compute split value histogram for the categorical feature')
                    else:
                        values.append(root['threshold'])
                add(root['left_child'])
                add(root['right_child'])

        model = self.dump_model()
        feature_names = model.get('feature_names')
        tree_infos = model['tree_info']
        values = []
        for tree_info in tree_infos:
            add(tree_info['tree_structure'])

        if bins is None or isinstance(bins, integer_types) and xgboost_style:
            n_unique = len(np.unique(values))
            bins = max(min(n_unique, bins) if bins is not None else n_unique, 1)
        hist, bin_edges = np.histogram(values, bins=bins)
        if xgboost_style:
            ret = np.column_stack((bin_edges[1:], hist))
            ret = ret[ret[:, 1] > 0]
            if PANDAS_INSTALLED:
                return DataFrame(ret, columns=['SplitValue', 'Count'])
            else:
                return ret
        else:
            return hist, bin_edges

    def __inner_eval(self, data_name, data_idx, feval=None):
        """Evaluate training or validation data."""
        if data_idx >= self.__num_dataset:
            raise ValueError("Data_idx should be smaller than number of datasets")
        self.__get_eval_info()
        ret = []
        if self.__num_inner_eval > 0:
            result = np.zeros(self.__num_inner_eval, dtype=np.float64)
            tmp_out_len = ctypes.c_int(0)
            _safe_call(_LIB.LGBM_BoosterGetEval(
                self.handle,
                ctypes.c_int(data_idx),
                ctypes.byref(tmp_out_len),
                result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
            if tmp_out_len.value != self.__num_inner_eval:
                raise ValueError("Wrong length of eval results")
            for i in range_(self.__num_inner_eval):
                ret.append((data_name, self.__name_inner_eval[i],
                            result[i], self.__higher_better_inner_eval[i]))
        if feval is not None:
            if data_idx == 0:
                cur_data = self.train_set
            else:
                cur_data = self.valid_sets[data_idx - 1]
            feval_ret = feval(self.__inner_predict(data_idx), cur_data)
            if isinstance(feval_ret, list):
                for eval_name, val, is_higher_better in feval_ret:
                    ret.append((data_name, eval_name, val, is_higher_better))
            else:
                eval_name, val, is_higher_better = feval_ret
                ret.append((data_name, eval_name, val, is_higher_better))
        return ret

    def __inner_predict(self, data_idx):
        """Predict for training and validation dataset."""
        if data_idx >= self.__num_dataset:
            raise ValueError("Data_idx should be smaller than number of datasets")
        if self.__inner_predict_buffer[data_idx] is None:
            if data_idx == 0:
                n_preds = self.train_set.num_data() * self.__num_class
            else:
                n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
            self.__inner_predict_buffer[data_idx] = np.zeros(n_preds, dtype=np.float64)
        # avoid predicting many times in one iteration
        if not self.__is_predicted_cur_iter[data_idx]:
            tmp_out_len = ctypes.c_int64(0)
            data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double))
            _safe_call(_LIB.LGBM_BoosterGetPredict(
                self.handle,
                ctypes.c_int(data_idx),
                ctypes.byref(tmp_out_len),
                data_ptr))
            if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
                raise ValueError("Wrong length of predict results for data %d" % (data_idx))
            self.__is_predicted_cur_iter[data_idx] = True
        return self.__inner_predict_buffer[data_idx]

    def __get_eval_info(self):
        """Get inner evaluation count and names."""
        if self.__need_reload_eval_info:
            self.__need_reload_eval_info = False
            out_num_eval = ctypes.c_int(0)
            # Get num of inner evals
            _safe_call(_LIB.LGBM_BoosterGetEvalCounts(
                self.handle,
                ctypes.byref(out_num_eval)))
            self.__num_inner_eval = out_num_eval.value
            if self.__num_inner_eval > 0:
                # Get name of evals
                tmp_out_len = ctypes.c_int(0)
                string_buffers = [ctypes.create_string_buffer(255) for i in range_(self.__num_inner_eval)]
                ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
                _safe_call(_LIB.LGBM_BoosterGetEvalNames(
                    self.handle,
                    ctypes.byref(tmp_out_len),
                    ptr_string_buffers))
                if self.__num_inner_eval != tmp_out_len.value:
                    raise ValueError("Length of eval names doesn't match the number of evals")
                self.__name_inner_eval = \
                    [string_buffers[i].value.decode() for i in range_(self.__num_inner_eval)]
                self.__higher_better_inner_eval = \
                    [name.startswith(('auc', 'ndcg@', 'map@')) for name in self.__name_inner_eval]

    def attr(self, key):
        """Get attribute string from the Booster.

        Parameters
        ----------
        key : string
            The name of the attribute.

        Returns
        -------
        value : string or None
            The attribute value.
            Returns None if attribute does not exist.
        """
        return self.__attr.get(key, None)

    def set_attr(self, **kwargs):
        """Set attributes to the Booster.

        Parameters
        ----------
        **kwargs
            The attributes to set.
            Setting a value to None deletes an attribute.

        Returns
        -------
        self : Booster
            Booster with set attributes.
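
        Examples
        --------
        A minimal sketch (the attribute name and value are illustrative
        assumptions):

        >>> import numpy as np
        >>> import lightgbm as lgb
        >>> train_data = lgb.Dataset(np.random.rand(100, 2), label=np.random.rand(100))
        >>> bst = lgb.Booster({'objective': 'regression', 'verbose': -1}, train_data)
        >>> bst = bst.set_attr(owner='alice')
        >>> bst.attr('owner')
        'alice'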
        """
        for key, value in kwargs.items():
            if value is not None:
                if not isinstance(value, string_type):
                    raise ValueError("Only string values are accepted")
                self.__attr[key] = value
            else:
                self.__attr.pop(key, None)
        return self