Commit 9f4849b3 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub

Merge pull request #97 from wxchan/dev

Clean codes for python-package; dump model to JSON
parents fa51a676 69114525
......@@ -17,7 +17,7 @@ For more details, please refer to [Features](https://github.com/Microsoft/LightG
News
----
12/02/2012 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version, welcome to have a try and provide issues and feedback.
12/02/2016 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version, welcome to have a try and provide issues and feedback.
Get Started
------------
......
Python Package Example
======================
Here is an example of how to use the LightGBM Python package.
***You should install LightGBM (both the C++ library and the Python package) first.***
For the installation, check the wiki [here](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide).
You also need scikit-learn and pandas to run the examples, but they are not required for the package itself. You can install them with pip:
```
pip install -U scikit-learn
pip install -U pandas
```
Now you can run examples in this folder, for example:
```
python simple_example.py
```
# coding: utf-8
# pylint: disable = invalid-name, C0111
import json
import random
import numpy as np
import lightgbm as lgb
from sklearn import datasets, metrics, model_selection
rng = np.random.RandomState(2016)
X, y = datasets.make_classification(n_samples=10000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier(n_estimators=100).fit(x_train, y_train, [(x_test, y_test)], eval_metric="auc")
lgb_model.predict(x_test)
# save model
lgb_model.booster().save_model('model.txt')
# load model
booster = lgb.Booster(model_file='model.txt')
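As a quick sanity check (a sketch, not part of the original example), the loaded booster should reproduce the original model's raw predictions:
```
import numpy as np
# both calls use Booster.predict with default arguments, so they should agree
np.testing.assert_allclose(lgb_model.booster().predict(x_test),
                           booster.predict(x_test))
```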
import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# or you can simply use a tuple of length=2 here
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)
# specify your configurations as a dict
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': 'l2',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
# 'ndcg_eval_at': [1, 3, 5, 10],
# this metric is not needed for this task; it is shown only as an example
'verbose': 0
}
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
valid_datas=lgb_eval,
# you can use a list to represent multiple valid_datas/valid_names
# don't use a tuple: a tuple is used to represent one dataset
early_stopping_rounds=10)
# save model to file
gbm.save_model('model.txt')
# load model from file
gbm = lgb.Booster(model_file='model.txt')
# predict with the loaded model
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
print(y_pred)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)
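Since the dump is plain JSON, it is easy to inspect programmatically. Below is a small sketch (not part of the original example); the key names follow the C++ `DumpModel()`/`ToJSON()` code later in this commit, where leaf nodes carry `leaf_index` and internal nodes carry `left_child`/`right_child`:
```
def count_leaves(node):
    """Recursively count the leaves of one dumped tree."""
    if 'leaf_index' in node:
        return 1
    return count_leaves(node['left_child']) + count_leaves(node['right_child'])

print('leaves in tree 0:',
      count_leaves(model_json['tree_info'][0]['tree_structure']))
```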
# coding: utf-8
# pylint: disable = invalid-name, C0111
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# train
gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=100)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
early_stopping_rounds=10)
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
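The new `dump_model()` is also reachable from the sklearn wrapper through the `booster()` accessor used earlier; a minimal sketch:
```
# dump the fitted sklearn-style model via its underlying Booster
model_json = gbm.booster().dump_model()
print('number of trees:', len(model_json['tree_info']))
```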
......@@ -133,9 +133,16 @@ public:
const double* feature_values) const = 0;
/*!
* \brief save model to file
* \param num_iterations Iterations that want to save, -1 means save all
* \param filename filename that want to save to
* \brief Dump model to json format string
* \return Json format string of model
*/
virtual std::string DumpModel() const = 0;
/*!
* \brief Save model to file
* \param num_iterations Number of iterations to save, -1 means save all
* \param filename Filename to save to
*/
virtual void SaveModelToFile(int num_iterations, const char* filename) const = 0;
......
......@@ -474,7 +474,18 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
int num_iteration,
const char* filename);
/*!
* \brief dump model to json
* \param handle handle
* \param buffer_len string buffer length, if buffer_len < out_len, re-allocate buffer
* \param out_len actual output length
* \param out_str json format string of model
* \return 0 when successful, -1 when a failure happens
*/
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len,
int64_t* out_len,
char** out_str);
// some help functions used to convert data
......
......@@ -98,9 +98,12 @@ public:
}
}
/*! \brief Serialize this object by string*/
/*! \brief Serialize this object to string*/
std::string ToString();
/*! \brief Serialize this object to json*/
std::string ToJSON();
private:
/*!
* \brief Find the index of the leaf that a record falls into, given its feature values
......@@ -118,6 +121,9 @@ private:
*/
inline int GetLeaf(const double* feature_values) const;
/*! \brief Serialize one node to json*/
inline std::string NodeToJSON(int index);
/*! \brief Number of max leaves*/
int max_leaves_;
/*! \brief Number of current leaves*/
......@@ -137,13 +143,13 @@ private:
std::vector<double> threshold_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
// used for leaf node
/*! \brief The parent of leaf */
std::vector<int> leaf_parent_;
/*! \brief Output of leaves */
std::vector<double> leaf_value_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
/*! \brief Depth for leaves */
std::vector<int> leaf_depth_;
};
......
......@@ -20,4 +20,5 @@ __version__ = 0.1
__all__ = ['Dataset', 'Booster',
'train', 'cv',
'LGBMModel','LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
\ No newline at end of file
'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
# coding: utf-8
# pylint: disable = invalid-name, C0111, R0912, R0913, R0914, W0105
"""Wrapper c_api of LightGBM"""
from __future__ import absolute_import
......@@ -5,12 +7,24 @@ import sys
import os
import ctypes
import tempfile
import json
import numpy as np
import scipy.sparse
from .libpath import find_lib_path
# pandas
try:
    from pandas import Series, DataFrame
    IS_PANDAS_INSTALLED = True
except ImportError:
    IS_PANDAS_INSTALLED = False

    class Series(object):
        pass

    class DataFrame(object):
        pass
IS_PY3 = (sys.version_info[0] == 3)
def _load_lib():
......@@ -69,6 +83,8 @@ def list_to_1d_numpy(data, dtype):
return data.astype(dtype=dtype, copy=False)
elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
elif IS_PANDAS_INSTALLED and isinstance(data, Series):
return data.astype(dtype).values
else:
raise TypeError("Unknow type({})".format(type(data).__name__))
......@@ -110,7 +126,7 @@ def param_dict_to_str(data):
elif isinstance(val, (int, float, bool)):
pairs.append(str(key)+'='+str(val))
else:
raise TypeError('unknown type of parameter:%s, got:%s'
% (key, type(val).__name__))
return ' '.join(pairs)
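For the scalar branches shown above, a quick illustration of `param_dict_to_str` (a sketch; the pair order follows dict iteration order, so it may vary):
```
print(param_dict_to_str({'num_leaves': 31, 'learning_rate': 0.05, 'is_unbalance': True}))
# e.g. -> num_leaves=31 learning_rate=0.05 is_unbalance=True
```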
"""marco definition of data type in c_api of LightGBM"""
......@@ -183,7 +199,7 @@ class Predictor(object):
"""Prediction task"""
out_num_iterations = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
c_str(model_file),
ctypes.byref(out_num_iterations),
ctypes.byref(self.handle)))
out_num_class = ctypes.c_int64(0)
......@@ -357,7 +373,7 @@ class Predictor(object):
type_ptr_data,
len(csr.indptr),
len(csr.data),
csr.shape[1],
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
......@@ -367,13 +383,6 @@ class Predictor(object):
raise ValueError("incorrect number for predict result")
return preds, nrow
# pandas
try:
from pandas import DataFrame
except ImportError:
class DataFrame(object):
pass
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'float16': 'float',
......@@ -467,8 +476,8 @@ class Dataset(object):
self.data_has_header = True
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_DatasetCreateFromFile(
c_str(data),
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
elif isinstance(data, scipy.sparse.csr_matrix):
......@@ -830,6 +839,7 @@ class Booster(object):
self.__is_manage_handle = True
self.__train_data_name = "training"
self.__attr = {}
self.best_iteration = -1
params = {} if params is None else params
if silent:
params["verbose"] = 0
......@@ -1018,7 +1028,7 @@ class Booster(object):
self.handle,
ctypes.byref(out_cur_iter)))
return out_cur_iter.value
def eval(self, data, name, feval=None):
"""Evaluate for data
......@@ -1098,6 +1108,34 @@ class Booster(object):
num_iteration,
c_str(filename)))
def dump_model(self):
"""
Dump model to json format
Returns
-------
Json format of model
"""
buffer_len = 1 << 20
tmp_out_len = ctypes.c_int64(0)
string_buffer = ctypes.create_string_buffer(buffer_len)
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
_safe_call(_LIB.LGBM_BoosterDumpModel(
self.handle,
buffer_len,
ctypes.byref(tmp_out_len),
ctypes.byref(ptr_string_buffer)))
actual_len = tmp_out_len.value
if actual_len > buffer_len:
string_buffer = ctypes.create_string_buffer(actual_len)
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
_safe_call(_LIB.LGBM_BoosterDumpModel(
self.handle,
actual_len,
ctypes.byref(tmp_out_len),
ctypes.byref(ptr_string_buffer)))
return json.loads(string_buffer.value.decode())
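A usage sketch for the method above, assuming a saved model file from earlier in the examples (the top-level keys come from the C++ `DumpModel()` in this commit):
```
bst = Booster(model_file='model.txt')
model_json = bst.dump_model()  # already parsed from the JSON string
print(model_json['name'], model_json['num_class'], len(model_json['tree_info']))
```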
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
"""
Predict logic
......@@ -1147,7 +1185,7 @@ class Booster(object):
_safe_call(_LIB.LGBM_BoosterGetEval(
self.handle,
data_idx,
ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("incorrect number of eval results")
......@@ -1190,7 +1228,7 @@ class Booster(object):
ctypes.byref(tmp_out_len),
data_ptr))
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" % (data_idx) )
raise ValueError("incorrect number of predict results for data %d" % (data_idx))
self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx]
......
# coding: utf-8
# pylint: disable = invalid-name, W0105
from __future__ import absolute_import
import collections
......@@ -25,12 +27,12 @@ CallbackEnv = collections.namedtuple(
def _format_eval_result(value, show_stdv=True):
"""format metric string"""
if len(value) == 4:
return '%s_%s:%g' % (value[0], value[1], value[2])
return '%s\'s %s:%g' % (value[0], value[1], value[2])
elif len(value) == 5:
if show_stdv:
return '%s_%s:%g+%g' % (value[0], value[1], value[2], value[4])
return '%s\'s %s:%g+%g' % (value[0], value[1], value[2], value[4])
else:
return '%s_%s:%g' % (value[0], value[1], value[2])
return '%s\'s %s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
......@@ -55,9 +57,10 @@ def print_evaluation(period=1, show_stdv=True):
"""internal function"""
if len(env.evaluation_result_list) == 0 or period is False:
return
if (env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration):
result = '\t'.join([_format_eval_result(x, show_stdv) for x in env.evaluation_result_list])
print('[%d]\t%s\n' % (env.iteration, result))
if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
result = '\t'.join([_format_eval_result(x, show_stdv) \
for x in env.evaluation_result_list])
print('[%d]\t%s' % (env.iteration, result))
return callback
......@@ -131,13 +134,13 @@ def reset_learning_rate(learning_rates):
def early_stop(stopping_rounds, verbose=True):
"""Create a callback that activates early stopping.
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them
Parameters
----------
stopp_rounds : int
stopping_rounds : int
Number of rounds without any improvement after which training stops.
verbose : optional, bool
......@@ -154,13 +157,11 @@ def early_stop(stopping_rounds, verbose=True):
best_msg = {}
def init(env):
"""internal function"""
bst = env.model
if len(env.evaluation_result_list) == 0:
raise ValueError('For early stopping you need at least one set in evals.')
if verbose:
msg = "Will train until hasn't improved in {} rounds.\n"
msg = "Train until valid scores didn't improve in {} rounds."
print(msg.format(stopping_rounds))
for i in range(len(env.evaluation_result_list)):
......@@ -182,13 +183,13 @@ def early_stop(stopping_rounds, verbose=True):
best_score[i] = score
best_iter[i] = env.iteration
if verbose:
best_msg[i] = '[%d]\t%s' % ( env.iteration,
best_msg[i] = '[%d]\t%s' % (env.iteration, \
'\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
else:
if env.iteration - best_iter[i] >= stopping_rounds:
if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose:
print('early stopping, best message is:\n {} '.format(best_msg[i]))
print('early stopping, best iteration is:\n{}'.format(best_msg[i]))
raise EarlyStopException(best_iter[i])
return callback
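A sketch of wiring these callbacks in explicitly (assuming `train()` accepts a `callbacks` list the way `cv()` does, and that `params`/`lgb_train`/`lgb_eval` come from the earlier examples):
```
gbm = lgb.train(params, lgb_train,
                num_boost_round=100,
                valid_datas=lgb_eval,
                callbacks=[print_evaluation(period=10),
                           early_stop(10, verbose=True)])
```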
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Training Library containing training routines of LightGBM."""
from __future__ import absolute_import
......@@ -6,7 +8,7 @@ from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback
def _construct_dataset(X_y, reference=None,
params=None, other_fields=None,
predictor=None):
if 'max_bin' in params:
max_bin = int(params['max_bin'])
......@@ -30,10 +32,9 @@ def _construct_dataset(X_y, reference=None,
data = X_y[0]
label = X_y[1]
if reference is None:
ret = Dataset(data, label=label, max_bin=max_bin,
weight=weight, group=group,
predictor=predictor, params=params)
else:
ret = reference.create_valid(data, label=label, weight=weight,
group=group, params=params)
......@@ -53,11 +54,11 @@ def train(params, train_data, num_boost_round=100,
----------
params : dict
Parameters for training.
train_data : pair, (X, y) or filename of data
train_data : Dataset, tuple (X, y) or filename of data
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_datas: list of pairs (valid_X, valid_y) or filename of data
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data
List of data to be evaluated during training
valid_names: list of string
names of valid_datas
......@@ -72,18 +73,19 @@ def train(params, train_data, num_boost_round=100,
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data
other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary is used to store all evaluation results of all the items in valid_datas.
Example: with a valid_datas containing [valid_set, train_set] \
and valid_names containing ['eval', 'train'] and a parameter containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
Passing None means this function is not used.
......@@ -120,26 +122,36 @@ def train(params, train_data, num_boost_round=100,
else:
predictor = None
"""create dataset"""
train_set = _construct_dataset(train_data, None, params, train_fields, predictor)
if isinstance(train_data, Dataset):
train_set = train_data
else:
train_set = _construct_dataset(train_data, None, params, train_fields, predictor)
is_valid_contain_train = False
train_data_name = "training"
valid_sets = []
name_valid_sets = []
if valid_datas is not None:
for i in range(len(valid_datas)):
if isinstance(valid_datas, (Dataset, tuple)):
valid_datas = [valid_datas]
if isinstance(valid_names, str):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas):
other_fields = None if valid_fields is None else valid_fields[i]
"""reduce cost for prediction training data"""
if valid_datas[i] is train_data:
if valid_data is train_data:
is_valid_contain_train = True
if valid_names is not None:
train_data_name = valid_names[i]
continue
valid_set = _construct_dataset(
valid_datas[i],
train_set,
params,
other_fields,
predictor)
if isinstance(valid_data, Dataset):
valid_set = valid_data
else:
valid_set = _construct_dataset(
valid_data,
train_set,
params,
other_fields,
predictor)
valid_sets.append(valid_set)
if valid_names is not None:
name_valid_sets.append(valid_names[i])
......@@ -178,8 +190,8 @@ def train(params, train_data, num_boost_round=100,
booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train:
booster.set_train_data_name(train_data_name)
for i in range(len(valid_sets)):
booster.add_valid(valid_sets[i], name_valid_sets[i])
for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set)
"""start training"""
for i in range(num_boost_round):
for cb in callbacks_before_iter:
......@@ -209,9 +221,9 @@ def train(params, train_data, num_boost_round=100,
except callback.EarlyStopException:
break
if booster.attr('best_iteration') is not None:
booster.best_iteration = int(booster.attr('best_iteration'))
booster.best_iteration = int(booster.attr('best_iteration')) + 1
else:
booster.best_iteration = num_boost_round - 1
booster.best_iteration = num_boost_round
return booster
......@@ -233,13 +245,14 @@ class CVBooster(object):
return self.booster.eval_valid(feval)
try:
    from sklearn.model_selection import KFold, StratifiedKFold
except ImportError:
    from sklearn.cross_validation import KFold, StratifiedKFold
try:
    from sklearn.model_selection import StratifiedKFold
    SKLEARN_StratifiedKFold = True
except ImportError:
    try:
        from sklearn.cross_validation import StratifiedKFold
        SKLEARN_StratifiedKFold = True
    except ImportError:
        SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False):
"""
......@@ -270,7 +283,6 @@ def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False
return ret
def _agg_cv_result(raw_results):
# pylint: disable=invalid-name
"""
Aggregate cross-validation results.
"""
......@@ -294,7 +306,6 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, train_fields=None, early_stopping_rounds=None,
fpreproc=None, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
# pylint: disable = invalid-name
"""Cross-validation with given paramaters.
Parameters
......@@ -351,7 +362,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
if isinstance(params, list):
params = dict(params)
if not 'metric' in params:
if 'metric' not in params:
params['metric'] = []
else:
if is_str(params['metric']):
......@@ -410,7 +421,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
end_iteration=num_boost_round,
evaluation_result_list=res))
except callback.EarlyStopException as e:
for k in results.keys():
for k in results:
results[k] = results[k][:(e.best_iteration + 1)]
break
return results
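A usage sketch against the `cv()` signature above (illustrative values; `X_train`/`y_train` as in the earlier examples); on early stop, the handler just shown truncates each metric's history to the best iteration:
```
cv_results = cv(params, (X_train, y_train), num_boost_round=100,
                nfold=5, stratified=False, early_stopping_rounds=10,
                seed=0, show_stdv=True)
```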
# coding: utf-8
"""Find the path to lightgbm dynamic library files."""
import os
import platform
import sys
......
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
......@@ -81,9 +83,9 @@ class LGBMModel(LGBMModelBase):
num_leaves : int
Maximum tree leaves for base learners.
max_depth : int
Maximum tree depth for base learners, -1 means no limit.
learning_rate : float
Boosting learning rate
n_estimators : int
Number of boosted trees to fit.
silent : boolean
......@@ -92,7 +94,7 @@ class LGBMModel(LGBMModelBase):
Specify the learning task and the corresponding learning objective or
a custom objective function to be used (see note below).
nthread : int
Number of parallel threads
min_split_gain : float
Minimum loss reduction required to make a further partition on a leaf node of the tree.
min_child_weight : int
......@@ -105,9 +107,9 @@ class LGBMModel(LGBMModelBase):
Frequency of subsampling; <=0 means subsampling is disabled.
colsample_bytree : float
Subsample ratio of columns when constructing each tree.
reg_alpha : float
L1 regularization term on weights
reg_lambda : float
L2 regularization term on weights
scale_pos_weight : float
Balancing of positive and negative weights.
......@@ -122,7 +124,7 @@ class LGBMModel(LGBMModelBase):
parameter. In this case, it should have the signature
``objective(y_true, y_pred) -> grad, hess``:
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples] or shape [n_samples * n_class]
The predicted values
......@@ -137,12 +139,12 @@ class LGBMModel(LGBMModelBase):
and you should group grad and hess in this way as well
"""
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="regression",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED:
raise LightGBMError('sklearn needs to be installed in order to use this module')
......@@ -220,7 +222,8 @@ class LGBMModel(LGBMModelBase):
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data
other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
other_params: dict
other parameters
......@@ -235,6 +238,13 @@ class LGBMModel(LGBMModelBase):
params["objective"] = "None"
else:
params["objective"] = self.objective
if eval_metric is None and eval_set is not None:
eval_metric = {
'regression': 'l2',
'binary': 'binary_logloss',
'lambdarank': 'ndcg',
'multiclass': 'multi_logloss'
}.get(self.objective, None)
if callable(eval_metric):
feval = eval_metric
......@@ -249,7 +259,8 @@ class LGBMModel(LGBMModelBase):
self.n_estimators, valid_datas=eval_set,
early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, train_fields=train_fields, valid_fields=valid_fields)
verbose_eval=verbose, train_fields=train_fields,
valid_fields=valid_fields)
if evals_result:
for val in evals_result.items():
......@@ -320,14 +331,18 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
# Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass"
other_params['num_class'] = self.n_classes_
if eval_metric is None and eval_set is not None:
eval_metric = "multi_logloss"
else:
self.objective = "binary"
if eval_metric is None and eval_set is not None:
eval_metric = "binary_logloss"
self._le = LGBMLabelEncoder().fit(y)
training_labels = self._le.transform(y)
if eval_set is not None:
eval_set = list( (x[0], self._le.transform(x[1])) for x in eval_set )
eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set)
super(LGBMClassifier, self).fit(X, training_labels, eval_set,
eval_metric, early_stopping_rounds,
......@@ -430,6 +445,8 @@ class LGBMRanker(LGBMModel):
else:
self.objective = "lambdarank"
self.fobj = None
if eval_metric is None and eval_set is not None:
eval_metric = "ndcg"
super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
early_stopping_rounds, verbose,
......
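To see the new default-metric fallback in action (a sketch; the mapping lives in `LGBMModel.fit()` above, with overrides in the classifier and ranker): when an `eval_set` is given but no `eval_metric`, a regressor now evaluates `l2`, a binary classifier `binary_logloss`, a multiclass classifier `multi_logloss`, and a ranker `ndcg`. Assuming `X_train`/`X_test` from the earlier examples:
```
gbm = lgb.LGBMRegressor(n_estimators=50)
# no eval_metric passed: with objective='regression' the wrapper falls back to 'l2'
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])
```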
# coding: utf-8
# pylint: disable=invalid-name, exec-used
"""Setup lightgbm package."""
from __future__ import absolute_import
......
......@@ -393,6 +393,36 @@ void GBDT::Boosting() {
GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
}
std::string GBDT::DumpModel() const {
std::stringstream ss;
ss << "{";
ss << "\"name\":\"" << Name() << "\"," << std::endl;
ss << "\"num_class\":" << num_class_ << "," << std::endl;
ss << "\"label_index\":" << label_idx_ << "," << std::endl;
ss << "\"max_feature_idx\":" << max_feature_idx_ << "," << std::endl;
if (object_function_ != nullptr) {
ss << "\"objective\":\"" << object_function_->GetName() << "\"," << std::endl;
}
ss << "\"sigmoid\":" << sigmoid_ << "," << std::endl;
ss << "\"tree_info\":[";
for (int i = 0; i < static_cast<int>(models_.size()); ++i) {
if (i > 0) {
ss << ",";
}
ss << "{";
ss << "\"tree_index\":" << i << ",";
ss << models_[i]->ToJSON();
ss << "}";
}
ss << "]" << std::endl;
ss << "}" << std::endl;
return ss.str();
}
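For reference, a sketch of what the assembled string parses to on the Python side (field names are taken from the code above; the values and the `dump_str` name are only illustrative):
```
import json
model_json = json.loads(dump_str)  # dump_str: the string returned by DumpModel()
# model_json == {
#     'name': 'gbdt',
#     'num_class': 1,
#     'label_index': 0,
#     'max_feature_idx': 27,
#     'objective': 'regression',
#     'sigmoid': 1.0,
#     'tree_info': [{'tree_index': 0, 'num_leaves': 31, 'tree_structure': {...}}],
# }
```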
void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
/*! \brief File to write models */
std::ofstream output_file;
......@@ -426,7 +456,11 @@ void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
output_file << models_[i]->ToString() << std::endl;
}
output_file << std::endl << FeatureImportance() << std::endl;
std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
output_file << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
output_file << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
output_file.close();
}
......@@ -487,7 +521,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
num_init_iteration_ = num_iteration_for_pred_;
}
std::string GBDT::FeatureImportance() const {
std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
for (size_t iter = 0; iter < models_.size(); ++iter) {
for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
......@@ -507,13 +541,7 @@ std::string GBDT::FeatureImportance() const {
const std::pair<size_t, std::string>& rhs) {
return lhs.first > rhs.first;
});
std::stringstream str_buf;
// write to model file
str_buf << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
str_buf << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
return str_buf.str();
return pairs;
}
std::vector<double> GBDT::PredictRaw(const double* value) const {
......
......@@ -145,9 +145,16 @@ public:
std::vector<int> PredictLeafIndex(const double* value) const override;
/*!
* \brief save model to file
* \param num_iterations Iterations that want to save, -1 means save all
* \param filename filename that want to save to
* \brief Dump model to json format string
* \return Json format string of model
*/
std::string DumpModel() const override;
/*!
* \brief Save model to file
* \param num_iterations Number of iterations to save, -1 means save all
* \param filename Filename to save to
*/
virtual void SaveModelToFile(int num_iterations, const char* filename) const override;
......@@ -155,6 +162,7 @@ public:
* \brief Restore from a serialized string
*/
void LoadModelFromString(const std::string& model_str) override;
/*!
* \brief Get max feature index of this model
* \return Max feature index of this model
......@@ -231,7 +239,7 @@ protected:
* \brief Calculate feature importances
* \param last_iter Last tree use to calculate
*/
std::string FeatureImportance() const;
std::vector<std::pair<size_t, std::string>> FeatureImportance() const;
/*! \brief current iteration */
int iter_;
/*! \brief Pointer to training data */
......
......@@ -139,6 +139,10 @@ public:
boosting_->SaveModelToFile(num_iteration, filename);
}
std::string DumpModel() {
return boosting_->DumpModel();
}
int GetEvalCounts() const {
int ret = 0;
for (const auto& metric : train_metric_) {
......@@ -733,6 +737,20 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
API_END();
}
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len,
int64_t* out_len,
char** out_str) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
std::string model = ref_booster->DumpModel();
*out_len = static_cast<int64_t>(model.size());
if (*out_len <= buffer_len) {
std::strcpy(*out_str, model.c_str());
}
API_END();
}
// ---- start of some help functions
std::function<std::vector<double>(int row_idx)>
......
......@@ -125,6 +125,43 @@ std::string Tree::ToString() {
return ss.str();
}
std::string Tree::ToJSON() {
std::stringstream ss;
ss << "\"num_leaves\":" << num_leaves_ << "," << std::endl;
ss << "\"tree_structure\":" << NodeToJSON(0) << std::endl;
return ss.str();
}
std::string Tree::NodeToJSON(int index) {
std::stringstream ss;
if (index >= 0) {
// non-leaf
ss << "{" << std::endl;
ss << "\"split_index\":" << index << "," << std::endl;
ss << "\"split_feature\":" << split_feature_real_.data()[index] << "," << std::endl;
ss << "\"split_gain\":" << split_gain_.data()[index] << "," << std::endl;
ss << "\"threshold\":" << threshold_.data()[index] << "," << std::endl;
ss << "\"internal_value\":" << internal_value_.data()[index] << "," << std::endl;
ss << "\"left_child\":" << NodeToJSON(left_child_.data()[index]) << "," << std::endl;
ss << "\"right_child\":" << NodeToJSON(right_child_.data()[index]) << std::endl;
ss << "}";
} else {
// leaf
index = ~index;
ss << "{" << std::endl;
ss << "\"leaf_index\":" << index << "," << std::endl;
ss << "\"leaf_parent\":" << leaf_parent_.data()[index] << "," << std::endl;
ss << "\"leaf_value\":" << leaf_value_.data()[index] << std::endl;
ss << "}";
}
return ss.str();
}
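The index convention above deserves a note: children with index >= 0 are internal nodes, while leaf `k` is stored as its bitwise complement `~k` (that is, `-k-1`), which the `index = ~index` line inverts. A minimal Python illustration of the encoding:
```
leaf = 5
encoded = ~leaf          # -6: a negative child index marks a leaf
assert ~encoded == leaf  # the complement is its own inverse
```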
Tree::Tree(const std::string& str) {
std::vector<std::string> lines = Common::Split(str.c_str(), '\n');
std::unordered_map<std::string, std::string> key_vals;
......
# coding: utf-8
import numpy as np
from sklearn import datasets, metrics, model_selection
import lightgbm as lgb
X, Y = datasets.make_classification(n_samples=100000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.1)
......