Commit 9f4849b3 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Merge pull request #97 from wxchan/dev

Clean codes for python-package; dump model to JSON
parents fa51a676 69114525
...@@ -17,7 +17,7 @@ For more details, please refer to [Features](https://github.com/Microsoft/LightG ...@@ -17,7 +17,7 @@ For more details, please refer to [Features](https://github.com/Microsoft/LightG
News News
---- ----
12/02/2012 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version, welcome to have a try and provide issues and feedback. 12/02/2016 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version, welcome to have a try and provide issues and feedback.
Get Started Get Started
------------ ------------
......
Python Package Example
=====================
Here is an example for LightGBM to use python package.
***You should install lightgbm (both c++ and python version) first.***
For the installation, check the wiki [here](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide).
You also need scikit-learn and pandas to run the examples, but they are not required for the package itself. You can install them with pip:
```
pip install -U scikit-learn
pip install -U pandas
```
Now you can run examples in this folder, for example:
```
python simple_example.py
```
import numpy as np # coding: utf-8
import random # pylint: disable = invalid-name, C0111
import json
import lightgbm as lgb import lightgbm as lgb
from sklearn import datasets, metrics, model_selection import pandas as pd
from sklearn.metrics import mean_squared_error
rng = np.random.RandomState(2016)
# load or create your dataset
X, y = datasets.make_classification(n_samples=10000, n_features=100) df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1) df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
lgb_model = lgb.LGBMClassifier(n_estimators=100).fit(x_train, y_train, [(x_test, y_test)], eval_metric="auc")
lgb_model.predict(x_test) y_train = df_train[0]
# save model y_test = df_test[0]
lgb_model.booster().save_model('model.txt') X_train = df_train.drop(0, axis=1)
# load model X_test = df_test.drop(0, axis=1)
booster = lgb.Booster(model_file='model.txt')
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# or you can simply use a tuple of length=2 here
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)
# specify your configurations as a dict
params = {
'task' : 'train',
'boosting_type' : 'gbdt',
'objective' : 'regression',
'metric' : 'l2',
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'bagging_freq': 5,
# 'ndcg_eval_at' : [1, 3, 5, 10],
# this metric is not needed in this task, show as an example
'verbose' : 0
}
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
valid_datas=lgb_eval,
# you can use a list to represent multiple valid_datas/valid_names
# don't use tuple, tuple is used to represent one dataset
early_stopping_rounds=10)
# save model to file
gbm.save_model('model.txt')
# load model from file
gbm = lgb.Booster(model_file='model.txt')
# predict # predict
print(booster.predict(x_test)) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4)
# coding: utf-8
# pylint: disable = invalid-name, C0111
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
# regression.train / regression.test are tab-separated files with no header row
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
# column 0 holds the regression target; all remaining columns are features
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# train with the scikit-learn style wrapper; fitting stops early if the
# score on eval_set has not improved for 10 consecutive rounds
gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=100)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
early_stopping_rounds=10)
# predict using the best iteration found by early stopping
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval: report root mean squared error on the held-out test set
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
...@@ -133,9 +133,16 @@ public: ...@@ -133,9 +133,16 @@ public:
const double* feature_values) const = 0; const double* feature_values) const = 0;
/*! /*!
* \brief save model to file * \brief Dump model to json format string
* \param num_iterations Iterations that want to save, -1 means save all * \return Json format string of model
* \param filename filename that want to save to */
virtual std::string DumpModel() const = 0;
/*!
* \brief Save model to file
* \param num_used_model Number of model that want to save, -1 means save all
* \param is_finish Is training finished or not
* \param filename Filename that want to save to
*/ */
virtual void SaveModelToFile(int num_iterations, const char* filename) const = 0; virtual void SaveModelToFile(int num_iterations, const char* filename) const = 0;
......
...@@ -474,7 +474,18 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle, ...@@ -474,7 +474,18 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
int num_iteration, int num_iteration,
const char* filename); const char* filename);
/*!
* \brief dump model to json
* \param handle handle
* \param buffer_len string buffer length, if buffer_len < out_len, re-allocate buffer
* \param out_len actual output length
* \param out_str json format string of model
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len,
int64_t* out_len,
char** out_str);
// some help functions used to convert data // some help functions used to convert data
......
...@@ -98,9 +98,12 @@ public: ...@@ -98,9 +98,12 @@ public:
} }
} }
/*! \brief Serialize this object by string*/ /*! \brief Serialize this object to string*/
std::string ToString(); std::string ToString();
/*! \brief Serialize this object to json*/
std::string ToJSON();
private: private:
/*! /*!
* \brief Find leaf index of which record belongs by data * \brief Find leaf index of which record belongs by data
...@@ -118,6 +121,9 @@ private: ...@@ -118,6 +121,9 @@ private:
*/ */
inline int GetLeaf(const double* feature_values) const; inline int GetLeaf(const double* feature_values) const;
/*! \brief Serialize one node to json*/
inline std::string NodeToJSON(int index);
/*! \brief Number of max leaves*/ /*! \brief Number of max leaves*/
int max_leaves_; int max_leaves_;
/*! \brief Number of current leaves*/ /*! \brief Number of current leaves*/
...@@ -137,13 +143,13 @@ private: ...@@ -137,13 +143,13 @@ private:
std::vector<double> threshold_; std::vector<double> threshold_;
/*! \brief A non-leaf node's split gain */ /*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_; std::vector<double> split_gain_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
// used for leaf node // used for leaf node
/*! \brief The parent of leaf */ /*! \brief The parent of leaf */
std::vector<int> leaf_parent_; std::vector<int> leaf_parent_;
/*! \brief Output of leaves */ /*! \brief Output of leaves */
std::vector<double> leaf_value_; std::vector<double> leaf_value_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
/*! \brief Depth for leaves */ /*! \brief Depth for leaves */
std::vector<int> leaf_depth_; std::vector<int> leaf_depth_;
}; };
......
...@@ -20,4 +20,5 @@ __version__ = 0.1 ...@@ -20,4 +20,5 @@ __version__ = 0.1
__all__ = ['Dataset', 'Booster', __all__ = ['Dataset', 'Booster',
'train', 'cv', 'train', 'cv',
'LGBMModel','LGBMRegressor', 'LGBMClassifier', 'LGBMRanker'] 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
\ No newline at end of file
# coding: utf-8
# pylint: disable = invalid-name, C0111, R0912, R0913, R0914, W0105
"""Wrapper c_api of LightGBM""" """Wrapper c_api of LightGBM"""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -5,12 +7,24 @@ import sys ...@@ -5,12 +7,24 @@ import sys
import os import os
import ctypes import ctypes
import tempfile import tempfile
import json
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
from .libpath import find_lib_path from .libpath import find_lib_path
# pandas
try:
from pandas import Series, DataFrame
IS_PANDAS_INSTALLED = True
except ImportError:
IS_PANDAS_INSTALLED = False
class Series(object):
pass
class DataFrame(object):
pass
IS_PY3 = (sys.version_info[0] == 3) IS_PY3 = (sys.version_info[0] == 3)
def _load_lib(): def _load_lib():
...@@ -69,6 +83,8 @@ def list_to_1d_numpy(data, dtype): ...@@ -69,6 +83,8 @@ def list_to_1d_numpy(data, dtype):
return data.astype(dtype=dtype, copy=False) return data.astype(dtype=dtype, copy=False)
elif is_1d_list(data): elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False) return np.array(data, dtype=dtype, copy=False)
elif IS_PANDAS_INSTALLED and isinstance(data, Series):
return data.astype(dtype).values
else: else:
raise TypeError("Unknow type({})".format(type(data).__name__)) raise TypeError("Unknow type({})".format(type(data).__name__))
...@@ -110,7 +126,7 @@ def param_dict_to_str(data): ...@@ -110,7 +126,7 @@ def param_dict_to_str(data):
elif isinstance(val, (int, float, bool)): elif isinstance(val, (int, float, bool)):
pairs.append(str(key)+'='+str(val)) pairs.append(str(key)+'='+str(val))
else: else:
raise TypeError('unknow type of parameter:%s , got:%s' raise TypeError('unknow type of parameter:%s , got:%s'
% (key, type(val).__name__)) % (key, type(val).__name__))
return ' '.join(pairs) return ' '.join(pairs)
"""marco definition of data type in c_api of LightGBM""" """marco definition of data type in c_api of LightGBM"""
...@@ -183,7 +199,7 @@ class Predictor(object): ...@@ -183,7 +199,7 @@ class Predictor(object):
"""Prediction task""" """Prediction task"""
out_num_iterations = ctypes.c_int64(0) out_num_iterations = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile( _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
c_str(model_file), c_str(model_file),
ctypes.byref(out_num_iterations), ctypes.byref(out_num_iterations),
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
out_num_class = ctypes.c_int64(0) out_num_class = ctypes.c_int64(0)
...@@ -357,7 +373,7 @@ class Predictor(object): ...@@ -357,7 +373,7 @@ class Predictor(object):
type_ptr_data, type_ptr_data,
len(csr.indptr), len(csr.indptr),
len(csr.data), len(csr.data),
csr.shape[1], csr.shape[1],
predict_type, predict_type,
num_iteration, num_iteration,
ctypes.byref(out_num_preds), ctypes.byref(out_num_preds),
...@@ -367,13 +383,6 @@ class Predictor(object): ...@@ -367,13 +383,6 @@ class Predictor(object):
raise ValueError("incorrect number for predict result") raise ValueError("incorrect number for predict result")
return preds, nrow return preds, nrow
# pandas
try:
from pandas import DataFrame
except ImportError:
class DataFrame(object):
pass
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int', 'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'float16': 'float', 'uint32': 'int', 'uint64': 'int', 'float16': 'float',
...@@ -467,8 +476,8 @@ class Dataset(object): ...@@ -467,8 +476,8 @@ class Dataset(object):
self.data_has_header = True self.data_has_header = True
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_DatasetCreateFromFile( _safe_call(_LIB.LGBM_DatasetCreateFromFile(
c_str(data), c_str(data),
c_str(params_str), c_str(params_str),
ref_dataset, ref_dataset,
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
elif isinstance(data, scipy.sparse.csr_matrix): elif isinstance(data, scipy.sparse.csr_matrix):
...@@ -830,6 +839,7 @@ class Booster(object): ...@@ -830,6 +839,7 @@ class Booster(object):
self.__is_manage_handle = True self.__is_manage_handle = True
self.__train_data_name = "training" self.__train_data_name = "training"
self.__attr = {} self.__attr = {}
self.best_iteration = -1
params = {} if params is None else params params = {} if params is None else params
if silent: if silent:
params["verbose"] = 0 params["verbose"] = 0
...@@ -1018,7 +1028,7 @@ class Booster(object): ...@@ -1018,7 +1028,7 @@ class Booster(object):
self.handle, self.handle,
ctypes.byref(out_cur_iter))) ctypes.byref(out_cur_iter)))
return out_cur_iter.value return out_cur_iter.value
def eval(self, data, name, feval=None): def eval(self, data, name, feval=None):
"""Evaluate for data """Evaluate for data
...@@ -1098,6 +1108,34 @@ class Booster(object): ...@@ -1098,6 +1108,34 @@ class Booster(object):
num_iteration, num_iteration,
c_str(filename))) c_str(filename)))
def dump_model(self):
"""
Dump the model to JSON format.

Calls LGBM_BoosterDumpModel through ctypes with a 1 MB buffer;
if the C side reports the serialized model is larger than that,
the call is retried once with a buffer of the exact reported size.

Returns
-------
Json format of model (parsed into Python objects via json.loads)
"""
# initial guess for the serialized size: 1 MB
buffer_len = 1 << 20
tmp_out_len = ctypes.c_int64(0)
string_buffer = ctypes.create_string_buffer(buffer_len)
# pass the buffer address as a char** so the C side can write into it
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
_safe_call(_LIB.LGBM_BoosterDumpModel(
self.handle,
buffer_len,
ctypes.byref(tmp_out_len),
ctypes.byref(ptr_string_buffer)))
# tmp_out_len now holds the actual length the model needs
actual_len = tmp_out_len.value
# if the first buffer was too small, re-allocate at the exact size and dump again
if actual_len > buffer_len:
string_buffer = ctypes.create_string_buffer(actual_len)
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
_safe_call(_LIB.LGBM_BoosterDumpModel(
self.handle,
actual_len,
ctypes.byref(tmp_out_len),
ctypes.byref(ptr_string_buffer)))
return json.loads(string_buffer.value.decode())
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True): def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
""" """
Predict logic Predict logic
...@@ -1147,7 +1185,7 @@ class Booster(object): ...@@ -1147,7 +1185,7 @@ class Booster(object):
_safe_call(_LIB.LGBM_BoosterGetEval( _safe_call(_LIB.LGBM_BoosterGetEval(
self.handle, self.handle,
data_idx, data_idx,
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))) result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
if tmp_out_len.value != self.__num_inner_eval: if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("incorrect number of eval results") raise ValueError("incorrect number of eval results")
...@@ -1190,7 +1228,7 @@ class Booster(object): ...@@ -1190,7 +1228,7 @@ class Booster(object):
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
data_ptr)) data_ptr))
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]): if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" % (data_idx) ) raise ValueError("incorrect number of predict results for data %d" % (data_idx))
self.__is_predicted_cur_iter[data_idx] = True self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx] return self.__inner_predict_buffer[data_idx]
......
# coding: utf-8
# pylint: disable = invalid-name, W0105
from __future__ import absolute_import from __future__ import absolute_import
import collections import collections
...@@ -25,12 +27,12 @@ CallbackEnv = collections.namedtuple( ...@@ -25,12 +27,12 @@ CallbackEnv = collections.namedtuple(
def _format_eval_result(value, show_stdv=True): def _format_eval_result(value, show_stdv=True):
"""format metric string""" """format metric string"""
if len(value) == 4: if len(value) == 4:
return '%s_%s:%g' % (value[0], value[1], value[2]) return '%s\'s %s:%g' % (value[0], value[1], value[2])
elif len(value) == 5: elif len(value) == 5:
if show_stdv: if show_stdv:
return '%s_%s:%g+%g' % (value[0], value[1], value[2], value[4]) return '%s\'s %s:%g+%g' % (value[0], value[1], value[2], value[4])
else: else:
return '%s_%s:%g' % (value[0], value[1], value[2]) return '%s\'s %s:%g' % (value[0], value[1], value[2])
else: else:
raise ValueError("wrong metric value") raise ValueError("wrong metric value")
...@@ -55,9 +57,10 @@ def print_evaluation(period=1, show_stdv=True): ...@@ -55,9 +57,10 @@ def print_evaluation(period=1, show_stdv=True):
"""internal function""" """internal function"""
if len(env.evaluation_result_list) == 0 or period is False: if len(env.evaluation_result_list) == 0 or period is False:
return return
if (env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration): if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
result = '\t'.join([_format_eval_result(x, show_stdv) for x in env.evaluation_result_list]) result = '\t'.join([_format_eval_result(x, show_stdv) \
print('[%d]\t%s\n' % (env.iteration, result)) for x in env.evaluation_result_list])
print('[%d]\t%s' % (env.iteration, result))
return callback return callback
...@@ -131,13 +134,13 @@ def reset_learning_rate(learning_rates): ...@@ -131,13 +134,13 @@ def reset_learning_rate(learning_rates):
def early_stop(stopping_rounds, verbose=True): def early_stop(stopping_rounds, verbose=True):
"""Create a callback that activates early stopping. """Create a callback that activates early stopping.
Activates early stopping. Activates early stopping.
Requires at least one validation data and one metric Requires at least one validation data and one metric
If there's more than one, will check all of them If there's more than one, will check all of them
Parameters Parameters
---------- ----------
stopp_rounds : int stopping_rounds : int
The stopping rounds before the trend occur. The stopping rounds before the trend occur.
verbose : optional, bool verbose : optional, bool
...@@ -154,13 +157,11 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -154,13 +157,11 @@ def early_stop(stopping_rounds, verbose=True):
best_msg = {} best_msg = {}
def init(env): def init(env):
"""internal function""" """internal function"""
bst = env.model
if len(env.evaluation_result_list) == 0: if len(env.evaluation_result_list) == 0:
raise ValueError('For early stopping you need at least one set in evals.') raise ValueError('For early stopping you need at least one set in evals.')
if verbose: if verbose:
msg = "Will train until hasn't improved in {} rounds.\n" msg = "Train until valid scores didn't improve in {} rounds."
print(msg.format(stopping_rounds)) print(msg.format(stopping_rounds))
for i in range(len(env.evaluation_result_list)): for i in range(len(env.evaluation_result_list)):
...@@ -182,13 +183,13 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -182,13 +183,13 @@ def early_stop(stopping_rounds, verbose=True):
best_score[i] = score best_score[i] = score
best_iter[i] = env.iteration best_iter[i] = env.iteration
if verbose: if verbose:
best_msg[i] = '[%d]\t%s' % ( env.iteration, best_msg[i] = '[%d]\t%s' % (env.iteration, \
'\t'.join([_format_eval_result(x) for x in env.evaluation_result_list])) '\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
else: else:
if env.iteration - best_iter[i] >= stopping_rounds: if env.iteration - best_iter[i] >= stopping_rounds:
if env.model is not None: if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i])) env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose: if verbose:
print('early stopping, best message is:\n {} '.format(best_msg[i])) print('early stopping, best iteration is:\n{}'.format(best_msg[i]))
raise EarlyStopException(best_iter[i]) raise EarlyStopException(best_iter[i])
return callback return callback
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Training Library containing training routines of LightGBM.""" """Training Library containing training routines of LightGBM."""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -6,7 +8,7 @@ from .basic import LightGBMError, Predictor, Dataset, Booster, is_str ...@@ -6,7 +8,7 @@ from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback from . import callback
def _construct_dataset(X_y, reference=None, def _construct_dataset(X_y, reference=None,
params=None, other_fields=None, params=None, other_fields=None,
predictor=None): predictor=None):
if 'max_bin' in params: if 'max_bin' in params:
max_bin = int(params['max_bin']) max_bin = int(params['max_bin'])
...@@ -30,10 +32,9 @@ def _construct_dataset(X_y, reference=None, ...@@ -30,10 +32,9 @@ def _construct_dataset(X_y, reference=None,
data = X_y[0] data = X_y[0]
label = X_y[1] label = X_y[1]
if reference is None: if reference is None:
ret = Dataset(data, label=label, max_bin=max_bin, ret = Dataset(data, label=label, max_bin=max_bin,
weight=weight, group=group, weight=weight, group=group,
predictor=predictor, params=params) predictor=predictor, params=params)
else: else:
ret = reference.create_valid(data, label=label, weight=weight, ret = reference.create_valid(data, label=label, weight=weight,
group=group, params=params) group=group, params=params)
...@@ -53,11 +54,11 @@ def train(params, train_data, num_boost_round=100, ...@@ -53,11 +54,11 @@ def train(params, train_data, num_boost_round=100,
---------- ----------
params : dict params : dict
params. params.
train_data : pair, (X, y) or filename of data train_data : Dataset, tuple (X, y) or filename of data
Data to be trained. Data to be trained.
num_boost_round: int num_boost_round: int
Number of boosting iterations. Number of boosting iterations.
valid_datas: list of pairs (valid_X, valid_y) or filename of data valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data
List of data to be evaluated during training List of data to be evaluated during training
valid_names: list of string valid_names: list of string
names of valid_datas names of valid_datas
...@@ -72,18 +73,19 @@ def train(params, train_data, num_boost_round=100, ...@@ -72,18 +73,19 @@ def train(params, train_data, num_boost_round=100,
other data file in training data. e.g. train_fields['weight'] is weight data other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score support fields: weight, group, init_score
valid_fields : dict valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score support fields: weight, group, init_score
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. Activates early stopping.
Requires at least one validation data and one metric Requires at least one validation data and one metric
If there's more than one, will check all of them If there's more than one, will check all of them
Returns the model with (best_iter + early_stopping_rounds) Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_datas. This dictionary used to store all evaluation results of all the items in valid_datas.
Example: with a valid_datas containing [valid_set, train_set] and valid_names containing ['eval', 'train'] and Example: with a valid_datas containing [valid_set, train_set] \
a paramater containing ('metric':'logloss') and valid_names containing ['eval', 'train'] and a paramater containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]}, Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}} 'eval': {'logloss': ['0.480385', '0.357756', ...]}}
passed with None means no using this function passed with None means no using this function
...@@ -120,26 +122,36 @@ def train(params, train_data, num_boost_round=100, ...@@ -120,26 +122,36 @@ def train(params, train_data, num_boost_round=100,
else: else:
predictor = None predictor = None
"""create dataset""" """create dataset"""
train_set = _construct_dataset(train_data, None, params, train_fields, predictor) if isinstance(train_data, Dataset):
train_set = train_data
else:
train_set = _construct_dataset(train_data, None, params, train_fields, predictor)
is_valid_contain_train = False is_valid_contain_train = False
train_data_name = "training" train_data_name = "training"
valid_sets = [] valid_sets = []
name_valid_sets = [] name_valid_sets = []
if valid_datas is not None: if valid_datas is not None:
for i in range(len(valid_datas)): if isinstance(valid_datas, (Dataset, tuple)):
valid_datas = [valid_datas]
if isinstance(valid_names, str):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas):
other_fields = None if valid_fields is None else valid_fields[i] other_fields = None if valid_fields is None else valid_fields[i]
"""reduce cost for prediction training data""" """reduce cost for prediction training data"""
if valid_datas[i] is train_data: if valid_data is train_data:
is_valid_contain_train = True is_valid_contain_train = True
if valid_names is not None: if valid_names is not None:
train_data_name = valid_names[i] train_data_name = valid_names[i]
continue continue
valid_set = _construct_dataset( if isinstance(valid_data, Dataset):
valid_datas[i], valid_set = valid_data
train_set, else:
params, valid_set = _construct_dataset(
other_fields, valid_data,
predictor) train_set,
params,
other_fields,
predictor)
valid_sets.append(valid_set) valid_sets.append(valid_set)
if valid_names is not None: if valid_names is not None:
name_valid_sets.append(valid_names[i]) name_valid_sets.append(valid_names[i])
...@@ -178,8 +190,8 @@ def train(params, train_data, num_boost_round=100, ...@@ -178,8 +190,8 @@ def train(params, train_data, num_boost_round=100,
booster = Booster(params=params, train_set=train_set) booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train: if is_valid_contain_train:
booster.set_train_data_name(train_data_name) booster.set_train_data_name(train_data_name)
for i in range(len(valid_sets)): for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
booster.add_valid(valid_sets[i], name_valid_sets[i]) booster.add_valid(valid_set, name_valid_set)
"""start training""" """start training"""
for i in range(num_boost_round): for i in range(num_boost_round):
for cb in callbacks_before_iter: for cb in callbacks_before_iter:
...@@ -209,9 +221,9 @@ def train(params, train_data, num_boost_round=100, ...@@ -209,9 +221,9 @@ def train(params, train_data, num_boost_round=100,
except callback.EarlyStopException: except callback.EarlyStopException:
break break
if booster.attr('best_iteration') is not None: if booster.attr('best_iteration') is not None:
booster.best_iteration = int(booster.attr('best_iteration')) booster.best_iteration = int(booster.attr('best_iteration')) + 1
else: else:
booster.best_iteration = num_boost_round - 1 booster.best_iteration = num_boost_round
return booster return booster
...@@ -233,13 +245,14 @@ class CVBooster(object): ...@@ -233,13 +245,14 @@ class CVBooster(object):
return self.booster.eval_valid(feval) return self.booster.eval_valid(feval)
try: try:
try: from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold, StratifiedKFold
except ImportError:
from sklearn.cross_validation import KFold, StratifiedKFold
SKLEARN_StratifiedKFold = True SKLEARN_StratifiedKFold = True
except ImportError: except ImportError:
SKLEARN_StratifiedKFold = False try:
from sklearn.cross_validation import StratifiedKFold
SKLEARN_StratifiedKFold = True
except ImportError:
SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False): def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False):
""" """
...@@ -270,7 +283,6 @@ def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False ...@@ -270,7 +283,6 @@ def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False
return ret return ret
def _agg_cv_result(raw_results): def _agg_cv_result(raw_results):
# pylint: disable=invalid-name
""" """
Aggregate cross-validation results. Aggregate cross-validation results.
""" """
...@@ -294,7 +306,6 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -294,7 +306,6 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, train_fields=None, early_stopping_rounds=None, metrics=(), fobj=None, feval=None, train_fields=None, early_stopping_rounds=None,
fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None): callbacks=None):
# pylint: disable = invalid-name
"""Cross-validation with given paramaters. """Cross-validation with given paramaters.
Parameters Parameters
...@@ -351,7 +362,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -351,7 +362,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
if isinstance(params, list): if isinstance(params, list):
params = dict(params) params = dict(params)
if not 'metric' in params: if 'metric' not in params:
params['metric'] = [] params['metric'] = []
else: else:
if is_str(params['metric']): if is_str(params['metric']):
...@@ -410,7 +421,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -410,7 +421,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
end_iteration=num_boost_round, end_iteration=num_boost_round,
evaluation_result_list=res)) evaluation_result_list=res))
except callback.EarlyStopException as e: except callback.EarlyStopException as e:
for k in results.keys(): for k in results:
results[k] = results[k][:(e.best_iteration + 1)] results[k] = results[k][:(e.best_iteration + 1)]
break break
return results return results
# coding: utf-8
"""Find the path to lightgbm dynamic library files."""
import os import os
import platform
import sys import sys
......
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Scikit-Learn Wrapper interface for LightGBM.""" """Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -81,9 +83,9 @@ class LGBMModel(LGBMModelBase): ...@@ -81,9 +83,9 @@ class LGBMModel(LGBMModelBase):
num_leaves : int num_leaves : int
Maximum tree leaves for base learners. Maximum tree leaves for base learners.
max_depth : int max_depth : int
Maximum tree depth for base learners, -1 means no limit. Maximum tree depth for base learners, -1 means no limit.
learning_rate : float learning_rate : float
Boosting learning rate Boosting learning rate
n_estimators : int n_estimators : int
Number of boosted trees to fit. Number of boosted trees to fit.
silent : boolean silent : boolean
...@@ -92,7 +94,7 @@ class LGBMModel(LGBMModelBase): ...@@ -92,7 +94,7 @@ class LGBMModel(LGBMModelBase):
Specify the learning task and the corresponding learning objective or Specify the learning task and the corresponding learning objective or
a custom objective function to be used (see note below). a custom objective function to be used (see note below).
nthread : int nthread : int
Number of parallel threads Number of parallel threads
min_split_gain : float min_split_gain : float
Minimum loss reduction required to make a further partition on a leaf node of the tree. Minimum loss reduction required to make a further partition on a leaf node of the tree.
min_child_weight : int min_child_weight : int
...@@ -105,9 +107,9 @@ class LGBMModel(LGBMModelBase): ...@@ -105,9 +107,9 @@ class LGBMModel(LGBMModelBase):
frequence of subsample, <=0 means no enable frequence of subsample, <=0 means no enable
colsample_bytree : float colsample_bytree : float
Subsample ratio of columns when constructing each tree. Subsample ratio of columns when constructing each tree.
reg_alpha : float reg_alpha : float
L1 regularization term on weights L1 regularization term on weights
reg_lambda : float reg_lambda : float
L2 regularization term on weights L2 regularization term on weights
scale_pos_weight : float scale_pos_weight : float
Balancing of positive and negative weights. Balancing of positive and negative weights.
...@@ -122,7 +124,7 @@ class LGBMModel(LGBMModelBase): ...@@ -122,7 +124,7 @@ class LGBMModel(LGBMModelBase):
parameter. In this case, it should have the signature parameter. In this case, it should have the signature
``objective(y_true, y_pred) -> grad, hess``: ``objective(y_true, y_pred) -> grad, hess``:
y_true: array_like of shape [n_samples] y_true: array_like of shape [n_samples]
The target values The target values
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] y_pred: array_like of shape [n_samples] or shape[n_samples* n_class]
The predicted values The predicted values
...@@ -137,12 +139,12 @@ class LGBMModel(LGBMModelBase): ...@@ -137,12 +139,12 @@ class LGBMModel(LGBMModelBase):
and you should group grad and hess in this way as well and you should group grad and hess in this way as well
""" """
def __init__(self, num_leaves=31, max_depth=-1, def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255, learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="regression", silent=True, objective="regression",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10, nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1, reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0): is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED: if not SKLEARN_INSTALLED:
raise LightGBMError('sklearn needs to be installed in order to use this module') raise LightGBMError('sklearn needs to be installed in order to use this module')
...@@ -220,7 +222,8 @@ class LGBMModel(LGBMModelBase): ...@@ -220,7 +222,8 @@ class LGBMModel(LGBMModelBase):
other data file in training data. e.g. train_fields['weight'] is weight data other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score support fields: weight, group, init_score
valid_fields : dict valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score support fields: weight, group, init_score
other_params: dict other_params: dict
other parameters other parameters
...@@ -235,6 +238,13 @@ class LGBMModel(LGBMModelBase): ...@@ -235,6 +238,13 @@ class LGBMModel(LGBMModelBase):
params["objective"] = "None" params["objective"] = "None"
else: else:
params["objective"] = self.objective params["objective"] = self.objective
if eval_metric is None and eval_set is not None:
eval_metric = {
'regression': 'l2',
'binary': 'binary_logloss',
'lambdarank': 'ndcg',
'multiclass': 'multi_logloss'
}.get(self.objective, None)
if callable(eval_metric): if callable(eval_metric):
feval = eval_metric feval = eval_metric
...@@ -249,7 +259,8 @@ class LGBMModel(LGBMModelBase): ...@@ -249,7 +259,8 @@ class LGBMModel(LGBMModelBase):
self.n_estimators, valid_datas=eval_set, self.n_estimators, valid_datas=eval_set,
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval, evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, train_fields=train_fields, valid_fields=valid_fields) verbose_eval=verbose, train_fields=train_fields,
valid_fields=valid_fields)
if evals_result: if evals_result:
for val in evals_result.items(): for val in evals_result.items():
...@@ -320,14 +331,18 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -320,14 +331,18 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
# Switch to using a multiclass objective in the underlying LGBM instance # Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass" self.objective = "multiclass"
other_params['num_class'] = self.n_classes_ other_params['num_class'] = self.n_classes_
if eval_metric is None and eval_set is not None:
eval_metric = "multi_logloss"
else: else:
self.objective = "binary" self.objective = "binary"
if eval_metric is None and eval_set is not None:
eval_metric = "binary_logloss"
self._le = LGBMLabelEncoder().fit(y) self._le = LGBMLabelEncoder().fit(y)
training_labels = self._le.transform(y) training_labels = self._le.transform(y)
if eval_set is not None: if eval_set is not None:
eval_set = list( (x[0], self._le.transform(x[1])) for x in eval_set ) eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set)
super(LGBMClassifier, self).fit(X, training_labels, eval_set, super(LGBMClassifier, self).fit(X, training_labels, eval_set,
eval_metric, early_stopping_rounds, eval_metric, early_stopping_rounds,
...@@ -430,6 +445,8 @@ class LGBMRanker(LGBMModel): ...@@ -430,6 +445,8 @@ class LGBMRanker(LGBMModel):
else: else:
self.objective = "lambdarank" self.objective = "lambdarank"
self.fobj = None self.fobj = None
if eval_metric is None and eval_set is not None:
eval_metric = "ndcg"
super(LGBMRanker, self).fit(X, y, eval_set, eval_metric, super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
early_stopping_rounds, verbose, early_stopping_rounds, verbose,
......
# coding: utf-8
# pylint: disable=invalid-name, exec-used # pylint: disable=invalid-name, exec-used
"""Setup lightgbm package.""" """Setup lightgbm package."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -393,6 +393,36 @@ void GBDT::Boosting() { ...@@ -393,6 +393,36 @@ void GBDT::Boosting() {
GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data()); GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
} }
/*!
* \brief Dump the whole boosting model to a json format string.
*        Emits header fields (name, num_class, label_index, max_feature_idx,
*        optional objective, sigmoid) followed by one "tree_info" entry per tree.
* \return Json string of the model
*/
std::string GBDT::DumpModel() const {
  std::stringstream str_buf;
  // Use '\n' instead of std::endl throughout: std::endl requests a flush,
  // which is pointless (and slower) on an in-memory stringstream.
  str_buf << "{";
  str_buf << "\"name\":\"" << Name() << "\",\n";
  str_buf << "\"num_class\":" << num_class_ << ",\n";
  str_buf << "\"label_index\":" << label_idx_ << ",\n";
  str_buf << "\"max_feature_idx\":" << max_feature_idx_ << ",\n";
  // objective may be absent (object_function_ can be null, e.g. for a model
  // loaded for prediction only), so the key is emitted conditionally.
  if (object_function_ != nullptr) {
    str_buf << "\"objective\":\"" << object_function_->GetName() << "\",\n";
  }
  str_buf << "\"sigmoid\":" << sigmoid_ << ",\n";
  str_buf << "\"tree_info\":[";
  for (size_t i = 0; i < models_.size(); ++i) {
    // Comma-separate array elements; no trailing comma after the last tree.
    if (i > 0) {
      str_buf << ",";
    }
    str_buf << "{";
    str_buf << "\"tree_index\":" << i << ",";
    str_buf << models_[i]->ToJSON();
    str_buf << "}";
  }
  str_buf << "]\n";
  str_buf << "}\n";
  return str_buf.str();
}
void GBDT::SaveModelToFile(int num_iteration, const char* filename) const { void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
/*! \brief File to write models */ /*! \brief File to write models */
std::ofstream output_file; std::ofstream output_file;
...@@ -426,7 +456,11 @@ void GBDT::SaveModelToFile(int num_iteration, const char* filename) const { ...@@ -426,7 +456,11 @@ void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
output_file << models_[i]->ToString() << std::endl; output_file << models_[i]->ToString() << std::endl;
} }
output_file << std::endl << FeatureImportance() << std::endl; std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
output_file << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
output_file << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
output_file.close(); output_file.close();
} }
...@@ -487,7 +521,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) { ...@@ -487,7 +521,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
num_init_iteration_ = num_iteration_for_pred_; num_init_iteration_ = num_iteration_for_pred_;
} }
std::string GBDT::FeatureImportance() const { std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0); std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
for (size_t iter = 0; iter < models_.size(); ++iter) { for (size_t iter = 0; iter < models_.size(); ++iter) {
for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
...@@ -507,13 +541,7 @@ std::string GBDT::FeatureImportance() const { ...@@ -507,13 +541,7 @@ std::string GBDT::FeatureImportance() const {
const std::pair<size_t, std::string>& rhs) { const std::pair<size_t, std::string>& rhs) {
return lhs.first > rhs.first; return lhs.first > rhs.first;
}); });
std::stringstream str_buf; return pairs;
// write to model file
str_buf << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
str_buf << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
return str_buf.str();
} }
std::vector<double> GBDT::PredictRaw(const double* value) const { std::vector<double> GBDT::PredictRaw(const double* value) const {
......
...@@ -145,9 +145,16 @@ public: ...@@ -145,9 +145,16 @@ public:
std::vector<int> PredictLeafIndex(const double* value) const override; std::vector<int> PredictLeafIndex(const double* value) const override;
/*! /*!
* \brief save model to file * \brief Dump model to json format string
* \param num_iterations Iterations that want to save, -1 means save all * \return Json format string of model
* \param filename filename that want to save to */
std::string DumpModel() const override;
/*!
* \brief Save model to file
* \param num_iterations Number of iterations to save, -1 means save all
* \param filename Filename to save the model to
*/
virtual void SaveModelToFile(int num_iterations, const char* filename) const override ; virtual void SaveModelToFile(int num_iterations, const char* filename) const override ;
...@@ -155,6 +162,7 @@ public: ...@@ -155,6 +162,7 @@ public:
* \brief Restore from a serialized string * \brief Restore from a serialized string
*/ */
void LoadModelFromString(const std::string& model_str) override; void LoadModelFromString(const std::string& model_str) override;
/*! /*!
* \brief Get max feature index of this model * \brief Get max feature index of this model
* \return Max feature index of this model * \return Max feature index of this model
...@@ -231,7 +239,7 @@ protected: ...@@ -231,7 +239,7 @@ protected:
* \brief Calculate feature importances * \brief Calculate feature importances
* \param last_iter Last tree use to calculate * \param last_iter Last tree use to calculate
*/ */
std::string FeatureImportance() const; std::vector<std::pair<size_t, std::string>> FeatureImportance() const;
/*! \brief current iteration */ /*! \brief current iteration */
int iter_; int iter_;
/*! \brief Pointer to training data */ /*! \brief Pointer to training data */
......
...@@ -139,6 +139,10 @@ public: ...@@ -139,6 +139,10 @@ public:
boosting_->SaveModelToFile(num_iteration, filename); boosting_->SaveModelToFile(num_iteration, filename);
} }
  // Thin delegation: dump the underlying boosting model as a json string
  // (see GBDT::DumpModel for the emitted fields).
  std::string DumpModel() {
    return boosting_->DumpModel();
  }
int GetEvalCounts() const { int GetEvalCounts() const {
int ret = 0; int ret = 0;
for (const auto& metric : train_metric_) { for (const auto& metric : train_metric_) {
...@@ -733,6 +737,20 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle, ...@@ -733,6 +737,20 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
API_END(); API_END();
} }
/*!
* \brief Dump model to a json string in a caller-provided buffer.
* \param handle Handle of the booster
* \param buffer_len Size in bytes of the buffer pointed to by *out_str
* \param[out] out_len Actual length of the json string, excluding the
*                     terminating NUL; when *out_len >= buffer_len the caller
*                     must retry with a larger buffer
* \param[out] out_str Buffer the NUL-terminated json string is copied into
*                     (only written when it fits)
* \return 0 on success, nonzero on failure (via API_BEGIN/API_END)
*/
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
  int buffer_len,
  int64_t* out_len,
  char** out_str) {
  API_BEGIN();
  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
  std::string model = ref_booster->DumpModel();
  *out_len = static_cast<int64_t>(model.size());
  // Copy only when the buffer can hold the string plus its NUL terminator.
  // The previous check (*out_len <= buffer_len) overflowed the buffer by one
  // byte when model.size() == buffer_len, because strcpy writes size()+1 bytes.
  if (*out_len < static_cast<int64_t>(buffer_len)) {
    std::memcpy(*out_str, model.c_str(), model.size() + 1);
  }
  API_END();
}
// ---- start of some help functions // ---- start of some help functions
std::function<std::vector<double>(int row_idx)> std::function<std::vector<double>(int row_idx)>
......
...@@ -125,6 +125,43 @@ std::string Tree::ToString() { ...@@ -125,6 +125,43 @@ std::string Tree::ToString() {
return ss.str(); return ss.str();
} }
/*!
* \brief Serialize this tree to a json fragment: the leaf count and the
*        recursive node structure rooted at node 0.
* \return Json fragment string (key/value pairs, no surrounding braces --
*         the caller embeds it inside a tree_info object)
*/
std::string Tree::ToJSON() {
  std::stringstream str_buf;
  // '\n' rather than std::endl: no reason to request a flush on an
  // in-memory stringstream.
  str_buf << "\"num_leaves\":" << num_leaves_ << ",\n";
  str_buf << "\"tree_structure\":" << NodeToJSON(0) << "\n";
  return str_buf.str();
}
/*!
* \brief Recursively serialize one tree node (and its subtree) to json.
* \param index Node index; a non-negative value is an internal split node,
*              a negative value encodes a leaf as the bitwise complement
*              (~index) of the leaf index.
* \return Json object string for this node
*/
std::string Tree::NodeToJSON(int index) {
  std::stringstream str_buf;
  // '\n' instead of std::endl: avoid flush requests on a stringstream.
  if (index >= 0) {
    // Internal (split) node: emit split info, then recurse into children.
    str_buf << "{\n";
    str_buf << "\"split_index\":" << index << ",\n";
    str_buf << "\"split_feature\":" << split_feature_real_[index] << ",\n";
    str_buf << "\"split_gain\":" << split_gain_[index] << ",\n";
    str_buf << "\"threshold\":" << threshold_[index] << ",\n";
    str_buf << "\"internal_value\":" << internal_value_[index] << ",\n";
    str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << ",\n";
    str_buf << "\"right_child\":" << NodeToJSON(right_child_[index]) << "\n";
    str_buf << "}";
  } else {
    // Leaf node: decode the leaf index from its one's-complement encoding.
    index = ~index;
    str_buf << "{\n";
    str_buf << "\"leaf_index\":" << index << ",\n";
    str_buf << "\"leaf_parent\":" << leaf_parent_[index] << ",\n";
    str_buf << "\"leaf_value\":" << leaf_value_[index] << "\n";
    str_buf << "}";
  }
  return str_buf.str();
}
Tree::Tree(const std::string& str) { Tree::Tree(const std::string& str) {
std::vector<std::string> lines = Common::Split(str.c_str(), '\n'); std::vector<std::string> lines = Common::Split(str.c_str(), '\n');
std::unordered_map<std::string, std::string> key_vals; std::unordered_map<std::string, std::string> key_vals;
......
# coding: utf-8
import numpy as np import numpy as np
from sklearn import datasets, metrics, model_selection from sklearn import datasets, metrics, model_selection
import lightgbm as lgb import lightgbm as lgb
X, Y = datasets.make_classification(n_samples=100000, n_features=100) X, Y = datasets.make_classification(n_samples=100000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.1) x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment