Commit 9f4849b3 authored by Guolin Ke, committed by GitHub

Merge pull request #97 from wxchan/dev

Clean codes for python-package; dump model to JSON
parents fa51a676 69114525
......@@ -17,7 +17,7 @@ For more details, please refer to [Features](https://github.com/Microsoft/LightG
News
----
12/02/2012 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version; welcome to try it and provide feedback via issues.
12/02/2016 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version; welcome to try it and provide feedback via issues.
Get Started
------------
......
Python Package Example
=====================
Here are examples of how to use the LightGBM Python package.
***You should install lightgbm (both the C++ and Python versions) first.***
For the installation, check the wiki [here](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide).
You also need scikit-learn and pandas to run the examples, but they are not required for the package itself. You can install them with pip:
```
pip install -U scikit-learn
pip install -U pandas
```
Now you can run the examples in this folder, e.g.:
```
python simple_example.py
```
# coding: utf-8
# pylint: disable = invalid-name, C0111
import json
import numpy as np
import lightgbm as lgb
from sklearn import datasets, metrics, model_selection
rng = np.random.RandomState(2016)
X, y = datasets.make_classification(n_samples=10000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier(n_estimators=100).fit(x_train, y_train, [(x_test, y_test)], eval_metric="auc")
lgb_model.predict(x_test)
# save model
lgb_model.booster().save_model('model.txt')
# load model
booster = lgb.Booster(model_file='model.txt')
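As a quick sanity check (an illustrative sketch, not part of the example file), the reloaded booster should reproduce the fitted model's predictions; `metrics`, `x_test` and `y_test` come from the snippet above:
```
# hedged sketch: Booster.predict is assumed to return raw probabilities
# for this binary task
y_prob = booster.predict(x_test)
print('AUC of reloaded model:', metrics.roc_auc_score(y_test, y_prob))
```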
import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# or you can simply use a tuple of length=2 here
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # 'ndcg_eval_at': [1, 3, 5, 10],
    # shown only as an example; this metric is not needed for this task
    'verbose': 0
}
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_datas=lgb_eval,
                # use a list for multiple valid_datas/valid_names;
                # don't use a tuple, a tuple represents a single dataset
                early_stopping_rounds=10)
# save model to file
gbm.save_model('model.txt')
# load model from file
gbm = lgb.Booster(model_file='model.txt')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)
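To read the dump back, the standard json module suffices; the keys referenced below mirror those written by GBDT::DumpModel later in this diff (a sketch, not part of the example file):
```
# hedged sketch: reload the dumped model and inspect it
with open('model.json') as f:
    model_json = json.load(f)
# top-level keys include "name", "num_class", "max_feature_idx" and "tree_info"
print(model_json['name'], len(model_json['tree_info']))
```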
# coding: utf-8
# pylint: disable = invalid-name, C0111
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# train
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=100)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=10)
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
......@@ -133,9 +133,16 @@ public:
const double* feature_values) const = 0;
/*!
* \brief save model to file
* \param num_iterations Iterations that want to save, -1 means save all
* \param filename filename that want to save to
* \brief Dump model to json format string
* \return Json format string of model
*/
virtual std::string DumpModel() const = 0;
/*!
* \brief Save model to file
* \param num_iterations Number of iterations to save, -1 means save all
* \param filename Filename to save to
*/
virtual void SaveModelToFile(int num_iterations, const char* filename) const = 0;
......
......@@ -474,7 +474,18 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
int num_iteration,
const char* filename);
/*!
* \brief Dump model to JSON format string
* \param handle Booster handle
* \param buffer_len String buffer length; if buffer_len < out_len, the caller should re-allocate the buffer and call again
* \param out_len Actual output length
* \param out_str JSON format string of model
* \return 0 on success, -1 on failure
*/
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len,
int64_t* out_len,
char** out_str);
// some helper functions used to convert data
......
......@@ -98,9 +98,12 @@ public:
}
}
/*! \brief Serialize this object by string*/
/*! \brief Serialize this object to string*/
std::string ToString();
/*! \brief Serialize this object to json*/
std::string ToJSON();
private:
/*!
* \brief Find the index of the leaf that a record belongs to, given its feature values
......@@ -118,6 +121,9 @@ private:
*/
inline int GetLeaf(const double* feature_values) const;
/*! \brief Serialize one node to json*/
inline std::string NodeToJSON(int index);
/*! \brief Number of max leaves*/
int max_leaves_;
/*! \brief Number of current leaves*/
......@@ -137,13 +143,13 @@ private:
std::vector<double> threshold_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
/*! \brief Output of internal nodes (saved for per-inference feature importance calculation) */
std::vector<double> internal_value_;
// used for leaf node
/*! \brief The parent of leaf */
std::vector<int> leaf_parent_;
/*! \brief Output of leaves */
std::vector<double> leaf_value_;
/*! \brief Output of internal nodes (saved for per-inference feature importance calculation) */
std::vector<double> internal_value_;
/*! \brief Depth for leaves */
std::vector<int> leaf_depth_;
};
......
......@@ -20,4 +20,5 @@ __version__ = 0.1
__all__ = ['Dataset', 'Booster',
'train', 'cv',
'LGBMModel','LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
\ No newline at end of file
'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
# coding: utf-8
# pylint: disable = invalid-name, C0111, R0912, R0913, R0914, W0105
"""Wrapper c_api of LightGBM"""
from __future__ import absolute_import
......@@ -5,12 +7,24 @@ import sys
import os
import ctypes
import tempfile
import json
import numpy as np
import scipy.sparse
from .libpath import find_lib_path
# pandas
try:
    from pandas import Series, DataFrame
    IS_PANDAS_INSTALLED = True
except ImportError:
    IS_PANDAS_INSTALLED = False

    class Series(object):
        pass

    class DataFrame(object):
        pass
IS_PY3 = (sys.version_info[0] == 3)
def _load_lib():
......@@ -69,6 +83,8 @@ def list_to_1d_numpy(data, dtype):
    return data.astype(dtype=dtype, copy=False)
elif is_1d_list(data):
    return np.array(data, dtype=dtype, copy=False)
elif IS_PANDAS_INSTALLED and isinstance(data, Series):
    return data.astype(dtype).values
else:
    raise TypeError("Unknown type({})".format(type(data).__name__))
......@@ -367,13 +383,6 @@ class Predictor(object):
raise ValueError("incorrect number for predict result")
return preds, nrow
# pandas
try:
    from pandas import DataFrame
except ImportError:
    class DataFrame(object):
        pass
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
                       'int64': 'int', 'uint8': 'int', 'uint16': 'int',
                       'uint32': 'int', 'uint64': 'int', 'float16': 'float',
......@@ -830,6 +839,7 @@ class Booster(object):
self.__is_manage_handle = True
self.__train_data_name = "training"
self.__attr = {}
self.best_iteration = -1
params = {} if params is None else params
if silent:
    params["verbose"] = 0
......@@ -1098,6 +1108,34 @@ class Booster(object):
num_iteration,
c_str(filename)))
def dump_model(self):
    """
    Dump model to JSON format

    Returns
    -------
    Json format of model
    """
    buffer_len = 1 << 20
    tmp_out_len = ctypes.c_int64(0)
    string_buffer = ctypes.create_string_buffer(buffer_len)
    ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
    _safe_call(_LIB.LGBM_BoosterDumpModel(
        self.handle,
        buffer_len,
        ctypes.byref(tmp_out_len),
        ctypes.byref(ptr_string_buffer)))
    actual_len = tmp_out_len.value
    # if the default buffer was too small, allocate one of the reported
    # size (plus room for the null terminator) and call again
    if actual_len >= buffer_len:
        string_buffer = ctypes.create_string_buffer(actual_len + 1)
        ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
        _safe_call(_LIB.LGBM_BoosterDumpModel(
            self.handle,
            actual_len + 1,
            ctypes.byref(tmp_out_len),
            ctypes.byref(ptr_string_buffer)))
    return json.loads(string_buffer.value.decode())
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
"""
Predict logic
......@@ -1190,7 +1228,7 @@ class Booster(object):
ctypes.byref(tmp_out_len),
data_ptr))
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" % (data_idx) )
raise ValueError("incorrect number of predict results for data %d" % (data_idx))
self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx]
......
# coding: utf-8
# pylint: disable = invalid-name, W0105
from __future__ import absolute_import
import collections
......@@ -25,12 +27,12 @@ CallbackEnv = collections.namedtuple(
def _format_eval_result(value, show_stdv=True):
"""format metric string"""
if len(value) == 4:
return '%s_%s:%g' % (value[0], value[1], value[2])
return '%s\'s %s:%g' % (value[0], value[1], value[2])
elif len(value) == 5:
if show_stdv:
return '%s_%s:%g+%g' % (value[0], value[1], value[2], value[4])
return '%s\'s %s:%g+%g' % (value[0], value[1], value[2], value[4])
else:
return '%s_%s:%g' % (value[0], value[1], value[2])
return '%s\'s %s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
......@@ -55,9 +57,10 @@ def print_evaluation(period=1, show_stdv=True):
"""internal function"""
if len(env.evaluation_result_list) == 0 or period is False:
return
if (env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration):
result = '\t'.join([_format_eval_result(x, show_stdv) for x in env.evaluation_result_list])
print('[%d]\t%s\n' % (env.iteration, result))
if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
result = '\t'.join([_format_eval_result(x, show_stdv) \
for x in env.evaluation_result_list])
print('[%d]\t%s' % (env.iteration, result))
return callback
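A hedged usage sketch for this callback, assuming print_evaluation is importable from lightgbm.callback and that train() accepts a callbacks list like cv() does (params, lgb_train, lgb_eval as in the earlier example):
```
from lightgbm.callback import print_evaluation

# print evaluation results every 10 iterations instead of every iteration
gbm = lgb.train(params, lgb_train,
                num_boost_round=100,
                valid_datas=lgb_eval,
                callbacks=[print_evaluation(period=10)])
```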
......@@ -137,7 +140,7 @@ def early_stop(stopping_rounds, verbose=True):
Parameters
----------
stopp_rounds : int
stopping_rounds : int
Number of rounds without improvement after which training stops.
verbose : optional, bool
......@@ -154,13 +157,11 @@ def early_stop(stopping_rounds, verbose=True):
best_msg = {}
def init(env):
"""internal function"""
bst = env.model
if len(env.evaluation_result_list) == 0:
raise ValueError('For early stopping you need at least one set in evals.')
if verbose:
msg = "Will train until hasn't improved in {} rounds.\n"
msg = "Train until valid scores didn't improve in {} rounds."
print(msg.format(stopping_rounds))
for i in range(len(env.evaluation_result_list)):
......@@ -182,13 +183,13 @@ def early_stop(stopping_rounds, verbose=True):
best_score[i] = score
best_iter[i] = env.iteration
if verbose:
best_msg[i] = '[%d]\t%s' % ( env.iteration,
best_msg[i] = '[%d]\t%s' % (env.iteration, \
'\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
else:
if env.iteration - best_iter[i] >= stopping_rounds:
if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose:
print('early stopping, best message is:\n {} '.format(best_msg[i]))
print('early stopping, best iteration is:\n{}'.format(best_msg[i]))
raise EarlyStopException(best_iter[i])
return callback
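Similarly, a sketch of wiring early_stop in by hand; in normal use train()'s early_stopping_rounds argument presumably builds this callback for you (same import assumption as above):
```
from lightgbm.callback import early_stop

gbm = lgb.train(params, lgb_train,
                num_boost_round=1000,
                valid_datas=lgb_eval,
                callbacks=[early_stop(stopping_rounds=10, verbose=True)])
# the best round is recorded on the booster (1-based after this patch)
print(gbm.best_iteration)
```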
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Training Library containing training routines of LightGBM."""
from __future__ import absolute_import
......@@ -33,7 +35,6 @@ def _construct_dataset(X_y, reference=None,
ret = Dataset(data, label=label, max_bin=max_bin,
weight=weight, group=group,
predictor=predictor, params=params)
else:
ret = reference.create_valid(data, label=label, weight=weight,
group=group, params=params)
......@@ -53,11 +54,11 @@ def train(params, train_data, num_boost_round=100,
----------
params : dict
Parameters for training.
train_data : pair, (X, y) or filename of data
train_data : Dataset, tuple (X, y) or filename of data
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_datas: list of pairs (valid_X, valid_y) or filename of data
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data
List of data to be evaluated during training
valid_names: list of string
names of valid_datas
......@@ -72,7 +73,8 @@ def train(params, train_data, num_boost_round=100,
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data
other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
early_stopping_rounds: int
Activates early stopping.
......@@ -82,8 +84,8 @@ def train(params, train_data, num_boost_round=100,
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_datas.
Example: with a valid_datas containing [valid_set, train_set] and valid_names containing ['eval', 'train'] and
a paramater containing ('metric':'logloss')
Example: with a valid_datas containing [valid_set, train_set] \
and valid_names containing ['eval', 'train'] and a parameter containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
passing None means this feature is not used
......@@ -120,22 +122,32 @@ def train(params, train_data, num_boost_round=100,
else:
predictor = None
"""create dataset"""
if isinstance(train_data, Dataset):
train_set = train_data
else:
train_set = _construct_dataset(train_data, None, params, train_fields, predictor)
is_valid_contain_train = False
train_data_name = "training"
valid_sets = []
name_valid_sets = []
if valid_datas is not None:
for i in range(len(valid_datas)):
if isinstance(valid_datas, (Dataset, tuple)):
valid_datas = [valid_datas]
if isinstance(valid_names, str):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas):
other_fields = None if valid_fields is None else valid_fields[i]
"""reduce cost for prediction training data"""
if valid_datas[i] is train_data:
if valid_data is train_data:
is_valid_contain_train = True
if valid_names is not None:
train_data_name = valid_names[i]
continue
if isinstance(valid_data, Dataset):
valid_set = valid_data
else:
valid_set = _construct_dataset(
valid_datas[i],
valid_data,
train_set,
params,
other_fields,
......@@ -178,8 +190,8 @@ def train(params, train_data, num_boost_round=100,
booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train:
booster.set_train_data_name(train_data_name)
for i in range(len(valid_sets)):
booster.add_valid(valid_sets[i], name_valid_sets[i])
for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set)
"""start training"""
for i in range(num_boost_round):
for cb in callbacks_before_iter:
......@@ -209,9 +221,9 @@ def train(params, train_data, num_boost_round=100,
except callback.EarlyStopException:
break
if booster.attr('best_iteration') is not None:
booster.best_iteration = int(booster.attr('best_iteration'))
booster.best_iteration = int(booster.attr('best_iteration')) + 1
else:
booster.best_iteration = num_boost_round - 1
booster.best_iteration = num_boost_round
return booster
......@@ -233,12 +245,13 @@ class CVBooster(object):
return self.booster.eval_valid(feval)
try:
try:
from sklearn.model_selection import KFold, StratifiedKFold
except ImportError:
from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.model_selection import StratifiedKFold
SKLEARN_StratifiedKFold = True
except ImportError:
try:
from sklearn.cross_validation import StratifiedKFold
SKLEARN_StratifiedKFold = True
except ImportError:
SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False):
......@@ -270,7 +283,6 @@ def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False
return ret
def _agg_cv_result(raw_results):
# pylint: disable=invalid-name
"""
Aggregate cross-validation results.
"""
......@@ -294,7 +306,6 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, train_fields=None, early_stopping_rounds=None,
fpreproc=None, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
# pylint: disable = invalid-name
"""Cross-validation with given paramaters.
Parameters
......@@ -351,7 +362,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
if isinstance(params, list):
params = dict(params)
if not 'metric' in params:
if 'metric' not in params:
params['metric'] = []
else:
if is_str(params['metric']):
......@@ -410,7 +421,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
end_iteration=num_boost_round,
evaluation_result_list=res))
except callback.EarlyStopException as e:
for k in results.keys():
for k in results:
results[k] = results[k][:(e.best_iteration + 1)]
break
return results
# coding: utf-8
"""Find the path to lightgbm dynamic library files."""
import os
import platform
import sys
......
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
......@@ -220,7 +222,8 @@ class LGBMModel(LGBMModelBase):
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data
other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
other_params: dict
other parameters
......@@ -235,6 +238,13 @@ class LGBMModel(LGBMModelBase):
params["objective"] = "None"
else:
params["objective"] = self.objective
if eval_metric is None and eval_set is not None:
    eval_metric = {
        'regression': 'l2',
        'binary': 'binary_logloss',
        'lambdarank': 'ndcg',
        'multiclass': 'multi_logloss'
    }.get(self.objective, None)
if callable(eval_metric):
feval = eval_metric
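With this change, passing an eval_set without an eval_metric now falls back to a default metric for the objective; a hedged sketch using the regression example's data:
```
# 'regression' now defaults to the 'l2' metric when only eval_set is given
model = lgb.LGBMRegressor(objective='regression', n_estimators=100)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
```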
......@@ -249,7 +259,8 @@ class LGBMModel(LGBMModelBase):
self.n_estimators, valid_datas=eval_set,
early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, train_fields=train_fields, valid_fields=valid_fields)
verbose_eval=verbose, train_fields=train_fields,
valid_fields=valid_fields)
if evals_result:
for val in evals_result.items():
......@@ -320,14 +331,18 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
# Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass"
other_params['num_class'] = self.n_classes_
if eval_metric is None and eval_set is not None:
eval_metric = "multi_logloss"
else:
self.objective = "binary"
if eval_metric is None and eval_set is not None:
eval_metric = "binary_logloss"
self._le = LGBMLabelEncoder().fit(y)
training_labels = self._le.transform(y)
if eval_set is not None:
eval_set = list( (x[0], self._le.transform(x[1])) for x in eval_set )
eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set)
super(LGBMClassifier, self).fit(X, training_labels, eval_set,
eval_metric, early_stopping_rounds,
......@@ -430,6 +445,8 @@ class LGBMRanker(LGBMModel):
else:
self.objective = "lambdarank"
self.fobj = None
if eval_metric is None and eval_set is not None:
eval_metric = "ndcg"
super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
early_stopping_rounds, verbose,
......
# coding: utf-8
# pylint: disable=invalid-name, exec-used
"""Setup lightgbm package."""
from __future__ import absolute_import
......
......@@ -393,6 +393,36 @@ void GBDT::Boosting() {
GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
}
std::string GBDT::DumpModel() const {
  std::stringstream ss;
  ss << "{";
  ss << "\"name\":\"" << Name() << "\"," << std::endl;
  ss << "\"num_class\":" << num_class_ << "," << std::endl;
  ss << "\"label_index\":" << label_idx_ << "," << std::endl;
  ss << "\"max_feature_idx\":" << max_feature_idx_ << "," << std::endl;
  if (object_function_ != nullptr) {
    ss << "\"objective\":\"" << object_function_->GetName() << "\"," << std::endl;
  }
  ss << "\"sigmoid\":" << sigmoid_ << "," << std::endl;
  ss << "\"tree_info\":[";
  for (int i = 0; i < static_cast<int>(models_.size()); ++i) {
    if (i > 0) {
      ss << ",";
    }
    ss << "{";
    ss << "\"tree_index\":" << i << ",";
    ss << models_[i]->ToJSON();
    ss << "}";
  }
  ss << "]" << std::endl;
  ss << "}" << std::endl;
  return ss.str();
}
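Based on the stream writes above, a dump loaded on the Python side has roughly the following shape (a hand-written illustration, not captured output):
```
model_json = gbm.dump_model()
# {'name': ..., 'num_class': 1, 'label_index': 0, 'max_feature_idx': ...,
#  'objective': 'regression', 'sigmoid': ...,
#  'tree_info': [{'tree_index': 0, 'num_leaves': 31,
#                 'tree_structure': {...}}, ...]}
```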
void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
/*! \brief File to write models */
std::ofstream output_file;
......@@ -426,7 +456,11 @@ void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
output_file << models_[i]->ToString() << std::endl;
}
output_file << std::endl << FeatureImportance() << std::endl;
std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
output_file << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
  output_file << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
output_file.close();
}
......@@ -487,7 +521,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
num_init_iteration_ = num_iteration_for_pred_;
}
std::string GBDT::FeatureImportance() const {
std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
for (size_t iter = 0; iter < models_.size(); ++iter) {
for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
......@@ -507,13 +541,7 @@ std::string GBDT::FeatureImportance() const {
const std::pair<size_t, std::string>& rhs) {
return lhs.first > rhs.first;
});
std::stringstream str_buf;
// write to model file
str_buf << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
str_buf << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
return str_buf.str();
return pairs;
}
std::vector<double> GBDT::PredictRaw(const double* value) const {
......
......@@ -145,9 +145,16 @@ public:
std::vector<int> PredictLeafIndex(const double* value) const override;
/*!
* \brief save model to file
* \param num_iterations Iterations that want to save, -1 means save all
* \param filename filename that want to save to
* \brief Dump model to json format string
* \return Json format string of model
*/
std::string DumpModel() const override;
/*!
* \brief Save model to file
* \param num_iterations Number of iterations to save, -1 means save all
* \param filename Filename to save to
*/
virtual void SaveModelToFile(int num_iterations, const char* filename) const override;
......@@ -155,6 +162,7 @@ public:
* \brief Restore from a serialized string
*/
void LoadModelFromString(const std::string& model_str) override;
/*!
* \brief Get max feature index of this model
* \return Max feature index of this model
......@@ -231,7 +239,7 @@ protected:
* \brief Calculate feature importances
* \return Pairs of (importance count, feature name), sorted by importance
*/
std::string FeatureImportance() const;
std::vector<std::pair<size_t, std::string>> FeatureImportance() const;
/*! \brief current iteration */
int iter_;
/*! \brief Pointer to training data */
......
......@@ -139,6 +139,10 @@ public:
boosting_->SaveModelToFile(num_iteration, filename);
}
std::string DumpModel() {
  return boosting_->DumpModel();
}
int GetEvalCounts() const {
int ret = 0;
for (const auto& metric : train_metric_) {
......@@ -733,6 +737,20 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
API_END();
}
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
                                    int buffer_len,
                                    int64_t* out_len,
                                    char** out_str) {
  API_BEGIN();
  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
  std::string model = ref_booster->DumpModel();
  *out_len = static_cast<int64_t>(model.size());
  // copy only if the buffer also has room for the null terminator
  // that strcpy appends
  if (*out_len < buffer_len) {
    std::strcpy(*out_str, model.c_str());
  }
  API_END();
}
// ---- start of some helper functions
std::function<std::vector<double>(int row_idx)>
......
......@@ -125,6 +125,43 @@ std::string Tree::ToString() {
return ss.str();
}
std::string Tree::ToJSON() {
  std::stringstream ss;
  ss << "\"num_leaves\":" << num_leaves_ << "," << std::endl;
  ss << "\"tree_structure\":" << NodeToJSON(0) << std::endl;
  return ss.str();
}

std::string Tree::NodeToJSON(int index) {
  std::stringstream ss;
  if (index >= 0) {
    // non-leaf
    ss << "{" << std::endl;
    ss << "\"split_index\":" << index << "," << std::endl;
    ss << "\"split_feature\":" << split_feature_real_[index] << "," << std::endl;
    ss << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
    ss << "\"threshold\":" << threshold_[index] << "," << std::endl;
    ss << "\"internal_value\":" << internal_value_[index] << "," << std::endl;
    ss << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl;
    ss << "\"right_child\":" << NodeToJSON(right_child_[index]) << std::endl;
    ss << "}";
  } else {
    // leaf: a negative child index stores the bitwise complement (~) of
    // the leaf number
    index = ~index;
    ss << "{" << std::endl;
    ss << "\"leaf_index\":" << index << "," << std::endl;
    ss << "\"leaf_parent\":" << leaf_parent_[index] << "," << std::endl;
    ss << "\"leaf_value\":" << leaf_value_[index] << std::endl;
    ss << "}";
  }
  return ss.str();
}
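Since NodeToJSON gives split nodes left_child/right_child and leaves a leaf_index, the dumped structure can be walked recursively; a hedged Python sketch (model_json as returned by dump_model above):
```
def count_leaves(node):
    # leaf nodes carry 'leaf_index', 'leaf_parent' and 'leaf_value'
    if 'leaf_index' in node:
        return 1
    # split nodes carry 'split_feature', 'threshold', 'split_gain',
    # 'internal_value' and the two children
    return (count_leaves(node['left_child']) +
            count_leaves(node['right_child']))

tree0 = model_json['tree_info'][0]
assert count_leaves(tree0['tree_structure']) == tree0['num_leaves']
```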
Tree::Tree(const std::string& str) {
std::vector<std::string> lines = Common::Split(str.c_str(), '\n');
std::unordered_map<std::string, std::string> key_vals;
......
# coding: utf-8
import numpy as np
from sklearn import datasets, metrics, model_selection
import lightgbm as lgb
X, Y = datasets.make_classification(n_samples=100000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.1)
......