Commit eba6d200 authored by wxchan's avatar wxchan
Browse files

Squash into one commit:

1. merge python-package
2. add dump model to json
3. fix bugs
4. clean code with pylint
5. update python examples
parent 19e085c9
# coding: utf-8
"""Find the path to lightgbm dynamic library files."""
import os
import sys
def find_lib_path():
    """Find the path to LightGBM library files.

    Returns
    -------
    lib_path: list(string)
        List of all found library paths to LightGBM.

    Raises
    ------
    Exception
        If no compiled lib_lightgbm library can be located.
    """
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    dll_path = [curr_path,
                os.path.join(curr_path, '../../lib/'),
                os.path.join(curr_path, '../../'),
                os.path.join(curr_path, './lib/'),
                os.path.join(sys.prefix, 'lightgbm')]
    # Windows builds ship a .dll under windows/x64/Dll; other platforms use .so.
    if os.name == 'nt':
        dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/'))
        dll_path.append(os.path.join(curr_path, './windows/x64/Dll/'))
        dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path]
    else:
        dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path]
    # os.path.isfile already implies existence, so a separate exists() check is redundant.
    lib_path = [p for p in dll_path if os.path.isfile(p)]
    if not lib_path:
        raise Exception('Cannot find lightgbm Library')
    return lib_path
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from .engine import train
# Optional scikit-learn dependency: resolve the base classes at import time so
# the rest of this module can subclass them unconditionally.
try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
    SKLEARN_INSTALLED = True
    LGBMModelBase = BaseEstimator
    LGBMRegressorBase = RegressorMixin
    LGBMClassifierBase = ClassifierMixin
    LGBMLabelEncoder = LabelEncoder
except ImportError:
    # Fall back to plain ``object`` bases; LGBMModel.__init__ raises a clear
    # LightGBMError when sklearn is actually required.
    SKLEARN_INSTALLED = False
    LGBMModelBase = object
    LGBMClassifierBase = object
    LGBMRegressorBase = object
    LGBMLabelEncoder = None
def _point_wise_objective(func):
"""Decorate an objective function
Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
and you should group grad and hess in this way as well
Parameters
----------
func: callable
Expects a callable with signature ``func(y_true, y_pred)``:
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
-------
new_func: callable
The new objective function as expected by ``lightgbm.engine.train``.
The signature is ``new_func(preds, dataset)``:
preds: array_like, shape [n_samples] or shape[n_samples* n_class]
The predicted values
dataset: ``dataset``
The training set from which the labels will be extracted using
``dataset.get_label()``
"""
def inner(preds, dataset):
"""internal function"""
labels = dataset.get_label()
grad, hess = func(labels, preds)
"""weighted for objective"""
weight = dataset.get_weight()
if weight is not None:
"""only one class"""
if len(weight) == len(grad):
grad = np.multiply(grad, weight)
hess = np.multiply(hess, weight)
else:
num_data = len(weight)
num_class = len(grad) // num_data
if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal with num_class * num_data")
for k in range(num_class):
for i in range(num_data):
idx = k * num_data + i
grad[idx] *= weight[i]
hess[idx] *= weight[i]
return grad, hess
return inner
class LGBMModel(LGBMModelBase):
    """Implementation of the Scikit-Learn API for LightGBM.

    Parameters
    ----------
    num_leaves : int
        Maximum tree leaves for base learners.
    max_depth : int
        Maximum tree depth for base learners, -1 means no limit.
    learning_rate : float
        Boosting learning rate.
    n_estimators : int
        Number of boosted trees to fit.
    max_bin : int
        Maximum number of discrete bins for feature values.
    silent : boolean
        Whether to print messages while running boosting.
    objective : string or callable
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).
    nthread : int
        Number of parallel threads.
    min_split_gain : float
        Minimum loss reduction required to make a further partition on a leaf node of the tree.
    min_child_weight : int
        Minimum sum of instance weight(hessian) needed in a child(leaf).
    min_child_samples : int
        Minimum number of data need in a child(leaf).
    subsample : float
        Subsample ratio of the training instance.
    subsample_freq : int
        frequence of subsample, <=0 means no enable.
    colsample_bytree : float
        Subsample ratio of columns when constructing each tree.
    reg_alpha : float
        L1 regularization term on weights.
    reg_lambda : float
        L2 regularization term on weights.
    scale_pos_weight : float
        Balancing of positive and negative weights.
    is_unbalance : bool
        Is unbalance for binary classification.
    seed : int
        Random number seed.

    Note
    ----
    A custom objective function can be provided for the ``objective``
    parameter. In this case, it should have the signature
    ``objective(y_true, y_pred) -> grad, hess``:

        y_true: array_like of shape [n_samples]
            The target values
        y_pred: array_like of shape [n_samples] or shape[n_samples * n_class]
            The predicted values
        grad: array_like of shape [n_samples] or shape[n_samples * n_class]
            The value of the gradient for each sample point.
        hess: array_like of shape [n_samples] or shape[n_samples * n_class]
            The value of the second derivative for each sample point

    For multi-class task, the y_pred is grouped by class_id first, then by
    row_id: to get i-th row y_pred in j-th class, the access way is
    y_pred[j * num_data + i], and you should group grad and hess in this way
    as well.
    """

    def __init__(self, num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="regression",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        if not SKLEARN_INSTALLED:
            raise LightGBMError('sklearn needs to be installed in order to use this module')
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.silent = silent
        self.objective = objective
        self.nthread = nthread
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        self._Booster = None
        # Always present so evals_result() raises a clear LightGBMError
        # ('No results.') instead of AttributeError before fit is called.
        self.evals_result_ = {}
        # A callable objective is wrapped so the engine sees fobj(preds, dataset).
        if callable(self.objective):
            self.fobj = _point_wise_objective(self.objective)
        else:
            self.fobj = None

    def booster(self):
        """Get the underlying lightgbm Booster of this model.

        This will raise an exception when fit was not called.

        Returns
        -------
        booster : a lightgbm booster of underlying model
        """
        if self._Booster is None:
            raise LightGBMError('need to call fit beforehand')
        return self._Booster

    def get_params(self, deep=False):
        """Get parameters, adjusted to the names the LightGBM engine expects."""
        params = super(LGBMModel, self).get_params(deep=deep)
        params['verbose'] = 0 if self.silent else 1
        if self.nthread <= 0:
            # non-positive nthread means "let LightGBM pick the thread count"
            params.pop('nthread', None)
        return params

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            train_fields=None, valid_fields=None, other_params=None):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
            If callable, a custom evaluation metric. The call
            signature is func(y_predicted, dataset) where dataset will be a
            Dataset object such that you may need to call the get_label
            method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
        early_stopping_rounds : int
            Activates early stopping; requires at least one validation set.
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
        train_fields : dict
            other data file in training data. e.g. train_fields['weight'] is weight data
            support fields: weight, group, init_score
        valid_fields : dict
            other data file in training data. \
            e.g. valid_fields[0]['weight'] is weight data for first valid data
            support fields: weight, group, init_score
        other_params: dict
            other parameters

        Returns
        -------
        self : object
            The fitted estimator.
        """
        evals_result = {}
        params = self.get_params()
        if other_params is not None:
            params.update(other_params)
        if self.fobj:
            # custom objective: tell the engine not to apply a built-in one
            params["objective"] = "None"
        else:
            params["objective"] = self.objective
        if eval_metric is None and eval_set is not None:
            # pick a sensible default metric for the built-in objectives
            eval_metric = {
                'regression': 'l2',
                'binary': 'binary_logloss',
                'lambdarank': 'ndcg',
                'multiclass': 'multi_logloss'
            }.get(self.objective, None)
        # A callable metric goes through feval; a name (or list of names) goes
        # through params. The original code then unconditionally recomputed
        # feval a second time, which was dead code and has been removed.
        if callable(eval_metric):
            feval = eval_metric
        else:
            feval = None
            if is_str(eval_metric) or isinstance(eval_metric, list):
                params.update({'metric': eval_metric})
        self._Booster = train(params, (X, y),
                              self.n_estimators, valid_datas=eval_set,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=self.fobj, feval=feval,
                              verbose_eval=verbose, train_fields=train_fields,
                              valid_fields=valid_fields)
        # The original loop only copied each evals_result entry onto itself;
        # storing the dict directly is equivalent.
        self.evals_result_ = evals_result
        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """Predict with the fitted booster; see Booster.predict for details."""
        return self.booster().predict(data,
                                      raw_score=raw_score,
                                      num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """Return the predicted leaf of every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.
        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster().predict(X,
                                      pred_leaf=True,
                                      num_iteration=num_iteration)

    def evals_result(self):
        """Return the evaluation results recorded during ``fit``.

        Returns
        -------
        evals_result : dictionary

        Raises
        ------
        LightGBMError
            If no evaluation results are available.
        """
        if self.evals_result_:
            return self.evals_result_
        raise LightGBMError('No results.')
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    # Reuse LGBMModel's parameter documentation, swapping only the summary line
    # (the first two lines of LGBMModel.__doc__ are dropped and replaced).
    __doc__ = """Implementation of the scikit-learn API for LightGBM regression.
    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
    # Reuse LGBMModel's parameter documentation, swapping only the summary line.
    __doc__ = """Implementation of the scikit-learn API for LightGBM classification.
    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            train_fields=None, valid_fields=None, other_params=None):
        """Fit the classifier; see ``LGBMModel.fit`` for parameter details.

        Labels are encoded with a ``LabelEncoder``, and the objective and
        default metric are chosen from the number of distinct classes.
        """
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        if other_params is None:
            other_params = {}
        # NOTE(review): a callable self.objective is overwritten here, so custom
        # objectives are not supported by the classifier — confirm intended.
        if self.n_classes_ > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            self.objective = "multiclass"
            other_params['num_class'] = self.n_classes_
            if eval_metric is None and eval_set is not None:
                eval_metric = "multi_logloss"
        else:
            self.objective = "binary"
            if eval_metric is None and eval_set is not None:
                eval_metric = "binary_logloss"
        self._le = LGBMLabelEncoder().fit(y)
        training_labels = self._le.transform(y)
        if eval_set is not None:
            # Validation labels must use the same encoding as the training labels.
            eval_set = [(x[0], self._le.transform(x[1])) for x in eval_set]
        super(LGBMClassifier, self).fit(X, training_labels, eval_set,
                                        eval_metric, early_stopping_rounds,
                                        verbose, train_fields, valid_fields,
                                        other_params)
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """Predict class labels (decoded back to the original label space)."""
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if len(class_probs.shape) > 1:
            # multiclass: pick the class with the highest probability per row
            column_indexes = np.argmax(class_probs, axis=1)
        else:
            # binary: threshold the positive-class probability at 0.5
            column_indexes = np.repeat(0, class_probs.shape[0])
            column_indexes[class_probs > 0.5] = 1
        return self._le.inverse_transform(column_indexes)

    def predict_proba(self, data, raw_score=False, num_iteration=0):
        """Return class probabilities, shape [n_samples, n_classes]."""
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if self.n_classes_ > 2:
            return class_probs
        else:
            # binary: the booster returns P(class 1) only; build both columns
            classone_probs = class_probs
            classzero_probs = 1.0 - classone_probs
            return np.vstack((classzero_probs, classone_probs)).transpose()
def _group_wise_objective(func):
"""Decorate an objective function
Parameters
----------
func: callable
Expects a callable with signature ``func(y_true, group, y_pred)``:
y_true: array_like of shape [n_samples]
The target values
group : array_like of shape
group size data of data
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
-------
new_func: callable
The new objective function as expected by ``lightgbm.engine.train``.
The signature is ``new_func(preds, dataset)``:
preds: array_like, shape [n_samples] or shape[n_samples* n_class]
The predicted values
dataset: ``dataset``
The training set from which the labels will be extracted using
``dataset.get_label()``
"""
def inner(preds, dataset):
"""internal function"""
labels = dataset.get_label()
group = dataset.get_group()
if group is None:
raise ValueError("group should not be None for ranking task")
grad, hess = func(labels, group, preds)
"""weighted for objective"""
weight = dataset.get_weight()
if weight is not None:
"""only one class"""
if len(weight) == len(grad):
grad = np.multiply(grad, weight)
hess = np.multiply(hess, weight)
else:
raise ValueError("lenght of grad and hess should equal with num_data")
return grad, hess
return inner
class LGBMRanker(LGBMModel):
    # Reuse LGBMModel's parameter documentation, swapping only the summary line.
    __doc__ = """Implementation of the scikit-learn API for LightGBM ranking application.
    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            train_fields=None, valid_fields=None, other_params=None):
        """Fit the ranker; see ``LGBMModel.fit`` for parameter details.

        Raises
        ------
        ValueError
            If group information is missing for the training or validation data.
        """
        # check group data; guard against train_fields being None (its default)
        # so callers get the intended ValueError rather than a TypeError from
        # the membership test on None
        if train_fields is None or "group" not in train_fields:
            raise ValueError("should set group in train_fields for ranking task")
        if eval_set is not None:
            if valid_fields is None:
                raise ValueError("valid_fields cannot be None when eval_set is not None")
            elif len(valid_fields) != len(eval_set):
                raise ValueError("length of valid_fields should equal with eval_set")
            else:
                for inner in valid_fields:
                    if "group" not in inner:
                        raise ValueError("should set group in valid_fields for ranking task")
        # a callable objective gets the group-aware wrapper; otherwise default
        # to the built-in lambdarank objective
        if callable(self.objective):
            self.fobj = _group_wise_objective(self.objective)
        else:
            self.objective = "lambdarank"
            self.fobj = None
        if eval_metric is None and eval_set is not None:
            eval_metric = "ndcg"
        super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
                                    early_stopping_rounds, verbose,
                                    train_fields, valid_fields,
                                    other_params)
        return self
# coding: utf-8
# pylint: disable=invalid-name, exec-used
"""Setup lightgbm package."""
from __future__ import absolute_import
import sys
import os
from setuptools import setup, find_packages
# import subprocess
sys.path.insert(0, '.')

CURRENT_DIR = os.path.dirname(__file__)
libpath_py = os.path.join(CURRENT_DIR, 'lightgbm/libpath.py')
libpath = {'__file__': libpath_py}
# Execute libpath.py in an isolated namespace so find_lib_path() is usable
# before the package itself is importable.  Use a context manager so the
# file handle is closed instead of leaked.
with open(libpath_py, "rb") as f:
    exec(compile(f.read(), libpath_py, 'exec'), libpath, libpath)
LIB_PATH = libpath['find_lib_path']()
print("Install lib_lightgbm from: %s" % LIB_PATH)

setup(name='lightgbm',
      version='0.1',  # version must be a string per PEP 440, not a float
      description="LightGBM Python Package",
      install_requires=[
          'numpy',
          'scipy',
      ],
      maintainer='Guolin Ke',
      maintainer_email='guolin.ke@microsoft.com',
      zip_safe=False,
      packages=find_packages(),
      include_package_data=True,
      data_files=[('lightgbm', LIB_PATH)],
      url='https://github.com/Microsoft/LightGBM')
...@@ -108,7 +108,7 @@ void Application::LoadData() { ...@@ -108,7 +108,7 @@ void Application::LoadData() {
// prediction is needed if using input initial model(continued train) // prediction is needed if using input initial model(continued train)
PredictFunction predict_fun = nullptr; PredictFunction predict_fun = nullptr;
// need to continue training // need to continue training
if (boosting_->NumberOfSubModels() > 0) { if (boosting_->NumberOfTotalModel() > 0) {
Predictor predictor(boosting_.get(), true, false); Predictor predictor(boosting_.get(), true, false);
predict_fun = predictor.GetPredictFunction(); predict_fun = predictor.GetPredictFunction();
} }
...@@ -139,12 +139,16 @@ void Application::LoadData() { ...@@ -139,12 +139,16 @@ void Application::LoadData() {
for (auto metric_type : config_.metric_types) { for (auto metric_type : config_.metric_types) {
auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_.metric_config)); auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_.metric_config));
if (metric == nullptr) { continue; } if (metric == nullptr) { continue; }
metric->Init("training", train_data_->metadata(), metric->Init(train_data_->metadata(), train_data_->num_data());
train_data_->num_data());
train_metric_.push_back(std::move(metric)); train_metric_.push_back(std::move(metric));
} }
} }
train_metric_.shrink_to_fit(); train_metric_.shrink_to_fit();
if (config_.metric_types.size() > 0) {
// only when have metrics then need to construct validation data
// Add validation data, if it exists // Add validation data, if it exists
for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) { for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) {
// add // add
...@@ -164,8 +168,7 @@ void Application::LoadData() { ...@@ -164,8 +168,7 @@ void Application::LoadData() {
for (auto metric_type : config_.metric_types) { for (auto metric_type : config_.metric_types) {
auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_.metric_config)); auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_.metric_config));
if (metric == nullptr) { continue; } if (metric == nullptr) { continue; }
metric->Init(config_.io_config.valid_data_filenames[i].c_str(), metric->Init(valid_datas_.back()->metadata(),
valid_datas_.back()->metadata(),
valid_datas_.back()->num_data()); valid_datas_.back()->num_data());
valid_metrics_.back().push_back(std::move(metric)); valid_metrics_.back().push_back(std::move(metric));
} }
...@@ -173,6 +176,7 @@ void Application::LoadData() { ...@@ -173,6 +176,7 @@ void Application::LoadData() {
} }
valid_datas_.shrink_to_fit(); valid_datas_.shrink_to_fit();
valid_metrics_.shrink_to_fit(); valid_metrics_.shrink_to_fit();
}
auto end_time = std::chrono::high_resolution_clock::now(); auto end_time = std::chrono::high_resolution_clock::now();
// output used time on each iteration // output used time on each iteration
Log::Info("Finished loading data in %f seconds", Log::Info("Finished loading data in %f seconds",
...@@ -209,7 +213,7 @@ void Application::InitTrain() { ...@@ -209,7 +213,7 @@ void Application::InitTrain() {
Common::ConstPtrInVectorWrapper<Metric>(train_metric_)); Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
// add validation data into boosting // add validation data into boosting
for (size_t i = 0; i < valid_datas_.size(); ++i) { for (size_t i = 0; i < valid_datas_.size(); ++i) {
boosting_->AddDataset(valid_datas_[i].get(), boosting_->AddValidDataset(valid_datas_[i].get(),
Common::ConstPtrInVectorWrapper<Metric>(valid_metrics_[i])); Common::ConstPtrInVectorWrapper<Metric>(valid_metrics_[i]));
} }
Log::Info("Finished initializing training"); Log::Info("Finished initializing training");
...@@ -227,17 +231,15 @@ void Application::Train() { ...@@ -227,17 +231,15 @@ void Application::Train() {
// output used time per iteration // output used time per iteration
Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration<double, Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration<double,
std::milli>(end_time - start_time) * 1e-3, iter + 1); std::milli>(end_time - start_time) * 1e-3, iter + 1);
boosting_->SaveModelToFile(NO_LIMIT, is_finished, config_.io_config.output_model.c_str());
} }
is_finished = true;
// save model to file // save model to file
boosting_->SaveModelToFile(NO_LIMIT, is_finished, config_.io_config.output_model.c_str()); boosting_->SaveModelToFile(-1, config_.io_config.output_model.c_str());
Log::Info("Finished training"); Log::Info("Finished training");
} }
void Application::Predict() { void Application::Predict() {
boosting_->SetNumUsedModel(config_.io_config.num_model_predict); boosting_->SetNumIterationForPred(config_.io_config.num_iteration_predict);
// create predictor // create predictor
Predictor predictor(boosting_.get(), config_.io_config.is_predict_raw_score, Predictor predictor(boosting_.get(), config_.io_config.is_predict_raw_score,
config_.io_config.is_predict_leaf_index); config_.io_config.is_predict_leaf_index);
......
...@@ -15,7 +15,7 @@ BoostingType GetBoostingTypeFromModelFile(const char* filename) { ...@@ -15,7 +15,7 @@ BoostingType GetBoostingTypeFromModelFile(const char* filename) {
return BoostingType::kUnknow; return BoostingType::kUnknow;
} }
void LoadFileToBoosting(Boosting* boosting, const char* filename) { void Boosting::LoadFileToBoosting(Boosting* boosting, const char* filename) {
if (boosting != nullptr) { if (boosting != nullptr) {
TextReader<size_t> model_reader(filename, true); TextReader<size_t> model_reader(filename, true);
model_reader.ReadAllLines(); model_reader.ReadAllLines();
......
...@@ -43,6 +43,7 @@ public: ...@@ -43,6 +43,7 @@ public:
* \brief one training iteration * \brief one training iteration
*/ */
bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override { bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override {
is_update_score_cur_iter_ = false;
GBDT::TrainOneIter(gradient, hessian, false); GBDT::TrainOneIter(gradient, hessian, false);
// normalize // normalize
Normalize(); Normalize();
...@@ -58,22 +59,15 @@ public: ...@@ -58,22 +59,15 @@ public:
* \return training score * \return training score
*/ */
const score_t* GetTrainingScore(data_size_t* out_len) override { const score_t* GetTrainingScore(data_size_t* out_len) override {
if (!is_update_score_cur_iter_) {
// only drop one time in one iteration
DroppingTrees(); DroppingTrees();
is_update_score_cur_iter_ = true;
}
*out_len = train_score_updater_->num_data() * num_class_; *out_len = train_score_updater_->num_data() * num_class_;
return train_score_updater_->score(); return train_score_updater_->score();
} }
/*!
* \brief save model to file
* \param num_used_model number of model that want to save, -1 means save all
* \param is_finish is training finished or not
* \param filename filename that want to save to
*/
void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) override {
// only save model once when is_finish = true
if (is_finish && saved_model_size_ < 0) {
GBDT::SaveModelToFile(num_used_model, is_finish, filename);
}
}
/*! /*!
* \brief Get Type name of this boosting object * \brief Get Type name of this boosting object
*/ */
...@@ -133,6 +127,8 @@ private: ...@@ -133,6 +127,8 @@ private:
double drop_rate_; double drop_rate_;
/*! \brief Random generator, used to select dropping trees */ /*! \brief Random generator, used to select dropping trees */
Random random_for_drop_; Random random_for_drop_;
/*! \brief Flag that the score is update on current iter or not*/
bool is_update_score_cur_iter_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -16,7 +16,9 @@ ...@@ -16,7 +16,9 @@
namespace LightGBM { namespace LightGBM {
GBDT::GBDT() : saved_model_size_(-1), num_used_model_(0) { GBDT::GBDT()
:num_iteration_for_pred_(0),
num_init_iteration_(0) {
} }
...@@ -26,47 +28,66 @@ GBDT::~GBDT() { ...@@ -26,47 +28,66 @@ GBDT::~GBDT() {
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function, void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) { const std::vector<const Metric*>& training_metrics) {
gbdt_config_ = config;
iter_ = 0; iter_ = 0;
saved_model_size_ = -1; num_iteration_for_pred_ = 0;
num_used_model_ = 0;
max_feature_idx_ = 0; max_feature_idx_ = 0;
num_class_ = config->num_class;
train_data_ = nullptr;
ResetTrainingData(config, train_data, object_function, training_metrics);
}
void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) {
if (train_data_ != nullptr && !train_data_->CheckAlign(*train_data)) {
Log::Fatal("cannot reset training data, since new training data has different bin mappers");
}
gbdt_config_ = config;
early_stopping_round_ = gbdt_config_->early_stopping_round; early_stopping_round_ = gbdt_config_->early_stopping_round;
shrinkage_rate_ = gbdt_config_->learning_rate; shrinkage_rate_ = gbdt_config_->learning_rate;
train_data_ = train_data; random_ = Random(gbdt_config_->bagging_seed);
num_class_ = config->num_class;
// create tree learner // create tree learner
tree_learner_.clear();
for (int i = 0; i < num_class_; ++i) { for (int i = 0; i < num_class_; ++i) {
auto new_tree_learner = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config)); auto new_tree_learner = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config));
new_tree_learner->Init(train_data_); new_tree_learner->Init(train_data);
// init tree learner // init tree learner
tree_learner_.push_back(std::move(new_tree_learner)); tree_learner_.push_back(std::move(new_tree_learner));
} }
tree_learner_.shrink_to_fit(); tree_learner_.shrink_to_fit();
object_function_ = object_function; object_function_ = object_function;
// push training metrics // push training metrics
training_metrics_.clear();
for (const auto& metric : training_metrics) { for (const auto& metric : training_metrics) {
training_metrics_.push_back(metric); training_metrics_.push_back(metric);
} }
training_metrics_.shrink_to_fit(); training_metrics_.shrink_to_fit();
// create score tracker
train_score_updater_.reset(new ScoreUpdater(train_data_, num_class_));
num_data_ = train_data_->num_data();
// create buffer for gradients and hessians
if (object_function_ != nullptr) {
gradients_ = std::vector<score_t>(num_data_ * num_class_);
hessians_ = std::vector<score_t>(num_data_ * num_class_);
}
sigmoid_ = -1.0f; sigmoid_ = -1.0f;
if (object_function_ != nullptr if (object_function_ != nullptr
&& std::string(object_function_->GetName()) == std::string("binary")) { && std::string(object_function_->GetName()) == std::string("binary")) {
// only binary classification need sigmoid transform // only binary classification need sigmoid transform
sigmoid_ = gbdt_config_->sigmoid; sigmoid_ = gbdt_config_->sigmoid;
} }
if (train_data_ != train_data) {
// not same training data, need reset score and others
// create score tracker
train_score_updater_.reset(new ScoreUpdater(train_data, num_class_));
// update score
for (int i = 0; i < iter_; ++i) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
}
}
num_data_ = train_data->num_data();
// create buffer for gradients and hessians
if (object_function_ != nullptr) {
gradients_ = std::vector<score_t>(num_data_ * num_class_);
hessians_ = std::vector<score_t>(num_data_ * num_class_);
}
// get max feature index // get max feature index
max_feature_idx_ = train_data_->num_total_features() - 1; max_feature_idx_ = train_data->num_total_features() - 1;
// get label index // get label index
label_idx_ = train_data_->label_idx(); label_idx_ = train_data->label_idx();
// if need bagging, create buffer // if need bagging, create buffer
if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) { if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_); out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_);
...@@ -77,18 +98,24 @@ void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const O ...@@ -77,18 +98,24 @@ void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const O
bag_data_cnt_ = num_data_; bag_data_cnt_ = num_data_;
bag_data_indices_.clear(); bag_data_indices_.clear();
} }
// initialize random generator }
random_ = Random(gbdt_config_->bagging_seed); train_data_ = train_data;
} }
void GBDT::AddDataset(const Dataset* valid_data, void GBDT::AddValidDataset(const Dataset* valid_data,
const std::vector<const Metric*>& valid_metrics) { const std::vector<const Metric*>& valid_metrics) {
if (iter_ > 0) { if (!train_data_->CheckAlign(*valid_data)) {
Log::Fatal("Cannot add validation data after training started"); Log::Fatal("cannot add validation data, since it has different bin mappers with training data");
} }
// for a validation dataset, we need its score and metric // for a validation dataset, we need its score and metric
auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_)); auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_class_));
// update score
for (int i = 0; i < iter_; ++i) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
new_score_updater->AddScore(models_[curr_tree].get(), curr_class);
}
}
valid_score_updater_.push_back(std::move(new_score_updater)); valid_score_updater_.push_back(std::move(new_score_updater));
valid_metrics_.emplace_back(); valid_metrics_.emplace_back();
if (early_stopping_round_ > 0) { if (early_stopping_round_ > 0) {
...@@ -204,6 +231,25 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is ...@@ -204,6 +231,25 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
} }
void GBDT::RollbackOneIter() {
if (iter_ == 0) { return; }
int cur_iter = iter_ + num_init_iteration_ - 1;
// reset score
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto curr_tree = cur_iter * num_class_ + curr_class;
models_[curr_tree]->Shrinkage(-1.0);
train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(models_[curr_tree].get(), curr_class);
}
}
// remove model
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
models_.pop_back();
}
--iter_;
}
bool GBDT::EvalAndCheckEarlyStopping() { bool GBDT::EvalAndCheckEarlyStopping() {
bool is_met_early_stopping = false; bool is_met_early_stopping = false;
// print message for metric // print message for metric
...@@ -236,7 +282,7 @@ bool GBDT::OutputMetric(int iter) { ...@@ -236,7 +282,7 @@ bool GBDT::OutputMetric(int iter) {
auto name = sub_metric->GetName(); auto name = sub_metric->GetName();
auto scores = sub_metric->Eval(train_score_updater_->score()); auto scores = sub_metric->Eval(train_score_updater_->score());
for (size_t k = 0; k < name.size(); ++k) { for (size_t k = 0; k < name.size(); ++k) {
Log::Info("Iteration: %d, %s : %f", iter, name[k].c_str(), scores[k]); Log::Info("Iteration:%d, training %s : %f", iter, name[k].c_str(), scores[k]);
} }
} }
} }
...@@ -248,7 +294,7 @@ bool GBDT::OutputMetric(int iter) { ...@@ -248,7 +294,7 @@ bool GBDT::OutputMetric(int iter) {
if ((iter % gbdt_config_->output_freq) == 0) { if ((iter % gbdt_config_->output_freq) == 0) {
auto name = valid_metrics_[i][j]->GetName(); auto name = valid_metrics_[i][j]->GetName();
for (size_t k = 0; k < name.size(); ++k) { for (size_t k = 0; k < name.size(); ++k) {
Log::Info("Iteration: %d, %s : %f", iter, name[k].c_str(), test_scores[k]); Log::Info("Iteration:%d, valid_%d %s : %f", iter, i + 1, name[k].c_str(), test_scores[k]);
} }
} }
if (!ret && early_stopping_round_ > 0) { if (!ret && early_stopping_round_ > 0) {
...@@ -296,24 +342,23 @@ const score_t* GBDT::GetTrainingScore(data_size_t* out_len) { ...@@ -296,24 +342,23 @@ const score_t* GBDT::GetTrainingScore(data_size_t* out_len) {
return train_score_updater_->score(); return train_score_updater_->score();
} }
void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) const { void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) {
CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size())); CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
std::vector<double> ret; std::vector<double> ret;
const score_t* raw_scores = nullptr; const score_t* raw_scores = nullptr;
data_size_t num_data = 0; data_size_t num_data = 0;
if (data_idx == 0) { if (data_idx == 0) {
raw_scores = train_score_updater_->score(); raw_scores = GetTrainingScore(out_len);
num_data = train_score_updater_->num_data(); num_data = train_score_updater_->num_data();
} else { } else {
auto used_idx = data_idx - 1; auto used_idx = data_idx - 1;
raw_scores = valid_score_updater_[used_idx]->score(); raw_scores = valid_score_updater_[used_idx]->score();
num_data = valid_score_updater_[used_idx]->num_data(); num_data = valid_score_updater_[used_idx]->num_data();
}
*out_len = num_data * num_class_; *out_len = num_data * num_class_;
}
if (num_class_ > 1) { if (num_class_ > 1) {
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
std::vector<double> tmp_result; std::vector<double> tmp_result;
for (int j = 0; j < num_class_; ++j) { for (int j = 0; j < num_class_; ++j) {
...@@ -325,12 +370,12 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) ...@@ -325,12 +370,12 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len)
} }
} }
} else if(sigmoid_ > 0.0f){ } else if(sigmoid_ > 0.0f){
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i]))); out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
} }
} else { } else {
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = raw_scores[i]; out_result[i] = raw_scores[i];
} }
...@@ -348,55 +393,85 @@ void GBDT::Boosting() { ...@@ -348,55 +393,85 @@ void GBDT::Boosting() {
GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data()); GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
} }
void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filename) { std::string GBDT::DumpModel() const {
// first time to this function, open file std::stringstream ss;
if (saved_model_size_ < 0) {
model_output_file_.open(filename); ss << "{";
ss << "\"name\":\"" << Name() << "\"," << std::endl;
ss << "\"num_class\":" << num_class_ << "," << std::endl;
ss << "\"label_index\":" << label_idx_ << "," << std::endl;
ss << "\"max_feature_idx\":" << max_feature_idx_ << "," << std::endl;
if (object_function_ != nullptr) {
ss << "\"objective\":\"" << object_function_->GetName() << "\"," << std::endl;
}
ss << "\"sigmoid\":" << sigmoid_ << "," << std::endl;
ss << "\"tree_info\":[";
for (int i = 0; i < static_cast<int>(models_.size()); ++i) {
if (i > 0) {
ss << ",";
}
ss << "{";
ss << "\"tree_index\":" << i << ",";
ss << models_[i]->ToJSON();
ss << "}";
}
ss << "]," << std::endl;
std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
ss << "\"feature_importances\":{" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
if (i > 0) {
ss << ",";
}
ss << "\"" << pairs[i].second << "\":" << pairs[i].first;
}
ss << "}" << std::endl;
ss << "}" << std::endl;
return ss.str();
}
void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
/*! \brief File to write models */
std::ofstream output_file;
output_file.open(filename);
// output model type // output model type
model_output_file_ << Name() << std::endl; output_file << Name() << std::endl;
// output number of class // output number of class
model_output_file_ << "num_class=" << num_class_ << std::endl; output_file << "num_class=" << num_class_ << std::endl;
// output label index // output label index
model_output_file_ << "label_index=" << label_idx_ << std::endl; output_file << "label_index=" << label_idx_ << std::endl;
// output max_feature_idx // output max_feature_idx
model_output_file_ << "max_feature_idx=" << max_feature_idx_ << std::endl; output_file << "max_feature_idx=" << max_feature_idx_ << std::endl;
// output objective name // output objective name
if (object_function_ != nullptr) { if (object_function_ != nullptr) {
model_output_file_ << "objective=" << object_function_->GetName() << std::endl; output_file << "objective=" << object_function_->GetName() << std::endl;
} }
// output sigmoid parameter // output sigmoid parameter
model_output_file_ << "sigmoid=" << sigmoid_ << std::endl; output_file << "sigmoid=" << sigmoid_ << std::endl;
model_output_file_ << std::endl; output_file << std::endl;
saved_model_size_ = 0;
} int num_used_model = 0;
// already saved if (num_iteration <= 0) {
if (!model_output_file_.is_open()) {
return;
}
if (num_used_model == NO_LIMIT) {
num_used_model = static_cast<int>(models_.size()); num_used_model = static_cast<int>(models_.size());
} else { } else {
num_used_model = num_used_model * num_class_; num_used_model = num_iteration * num_class_;
} }
int rest = num_used_model - early_stopping_round_ * num_class_; num_used_model = std::min(num_used_model, static_cast<int>(models_.size()));
// output tree models // output tree models
for (int i = saved_model_size_; i < rest; ++i) { for (int i = 0; i < num_used_model; ++i) {
model_output_file_ << "Tree=" << i << std::endl; output_file << "Tree=" << i << std::endl;
model_output_file_ << models_[i]->ToString() << std::endl; output_file << models_[i]->ToString() << std::endl;
} }
saved_model_size_ = std::max(saved_model_size_, rest); std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
output_file << std::endl << "feature importances:" << std::endl;
model_output_file_.flush(); for (size_t i = 0; i < pairs.size(); ++i) {
// training finished, can close file output_file << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
if (is_finish) {
for (int i = saved_model_size_; i < num_used_model; ++i) {
model_output_file_ << "Tree=" << i << std::endl;
model_output_file_ << models_[i]->ToString() << std::endl;
}
model_output_file_ << std::endl << FeatureImportance() << std::endl;
model_output_file_.close();
} }
output_file.close();
} }
void GBDT::LoadModelFromString(const std::string& model_str) { void GBDT::LoadModelFromString(const std::string& model_str) {
...@@ -452,10 +527,11 @@ void GBDT::LoadModelFromString(const std::string& model_str) { ...@@ -452,10 +527,11 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
} }
} }
Log::Info("Finished loading %d models", models_.size()); Log::Info("Finished loading %d models", models_.size());
num_used_model_ = static_cast<int>(models_.size()) / num_class_; num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
num_init_iteration_ = num_iteration_for_pred_;
} }
std::string GBDT::FeatureImportance() const { std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0); std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
for (size_t iter = 0; iter < models_.size(); ++iter) { for (size_t iter = 0; iter < models_.size(); ++iter) {
for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
...@@ -475,18 +551,12 @@ std::string GBDT::FeatureImportance() const { ...@@ -475,18 +551,12 @@ std::string GBDT::FeatureImportance() const {
const std::pair<size_t, std::string>& rhs) { const std::pair<size_t, std::string>& rhs) {
return lhs.first > rhs.first; return lhs.first > rhs.first;
}); });
std::stringstream str_buf; return pairs;
// write to model file
str_buf << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
str_buf << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
return str_buf.str();
} }
std::vector<double> GBDT::PredictRaw(const double* value) const { std::vector<double> GBDT::PredictRaw(const double* value) const {
std::vector<double> ret(num_class_, 0.0f); std::vector<double> ret(num_class_, 0.0f);
for (int i = 0; i < num_used_model_; ++i) { for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_class_; ++j) { for (int j = 0; j < num_class_; ++j) {
ret[j] += models_[i * num_class_ + j]->Predict(value); ret[j] += models_[i * num_class_ + j]->Predict(value);
} }
...@@ -496,7 +566,7 @@ std::vector<double> GBDT::PredictRaw(const double* value) const { ...@@ -496,7 +566,7 @@ std::vector<double> GBDT::PredictRaw(const double* value) const {
std::vector<double> GBDT::Predict(const double* value) const { std::vector<double> GBDT::Predict(const double* value) const {
std::vector<double> ret(num_class_, 0.0f); std::vector<double> ret(num_class_, 0.0f);
for (int i = 0; i < num_used_model_; ++i) { for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_class_; ++j) { for (int j = 0; j < num_class_; ++j) {
ret[j] += models_[i * num_class_ + j]->Predict(value); ret[j] += models_[i * num_class_ + j]->Predict(value);
} }
...@@ -512,7 +582,7 @@ std::vector<double> GBDT::Predict(const double* value) const { ...@@ -512,7 +582,7 @@ std::vector<double> GBDT::Predict(const double* value) const {
std::vector<int> GBDT::PredictLeafIndex(const double* value) const { std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
std::vector<int> ret; std::vector<int> ret;
for (int i = 0; i < num_used_model_; ++i) { for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_class_; ++j) { for (int j = 0; j < num_class_; ++j) {
ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value)); ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value));
} }
......
...@@ -35,12 +35,53 @@ public: ...@@ -35,12 +35,53 @@ public:
void Init(const BoostingConfig* gbdt_config, const Dataset* train_data, const ObjectiveFunction* object_function, void Init(const BoostingConfig* gbdt_config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) const std::vector<const Metric*>& training_metrics)
override; override;
/*!
* \brief Merge the model of another boosting object into this one.
* The trees of *other* are inserted in front of the current trees, so the
* merged-in model acts as the initial model that the existing trees
* continue from.
* \param other Boosting object to merge from; must actually be a GBDT
*/
void MergeFrom(const Boosting* other) override {
  // static_cast is the correct derived-class downcast here;
  // reinterpret_cast would silently break under multiple inheritance
  auto other_gbdt = static_cast<const GBDT*>(other);
  // move the current trees aside, then rebuild with other's trees first
  auto original_models = std::move(models_);
  models_ = std::vector<std::unique_ptr<Tree>>();
  models_.reserve(other_gbdt->models_.size() + original_models.size());
  // deep-copy trees from the other model first
  for (const auto& tree : other_gbdt->models_) {
    models_.push_back(std::unique_ptr<Tree>(new Tree(*tree)));
  }
  num_init_iteration_ = static_cast<int>(models_.size()) / num_class_;
  // then re-append this object's own trees
  for (const auto& tree : original_models) {
    models_.push_back(std::unique_ptr<Tree>(new Tree(*tree)));
  }
  num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
}
/*!
* \brief Reset training data for current boosting
* \param train_data Training data
* \param object_function Training objective function
* \param training_metrics Training metric
*/
void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function, const std::vector<const Metric*>& training_metrics) override;
/*!
* \brief Reset the shrinkage (learning) rate used for subsequent boosting iterations
* \param shrinkage_rate New shrinkage rate value
*/
void ResetShrinkageRate(double shrinkage_rate) override {
shrinkage_rate_ = shrinkage_rate;
}
/*! /*!
* \brief Adding a validation dataset * \brief Adding a validation dataset
* \param valid_data Validation dataset * \param valid_data Validation dataset
* \param valid_metrics Metrics for validation dataset * \param valid_metrics Metrics for validation dataset
*/ */
void AddDataset(const Dataset* valid_data, void AddValidDataset(const Dataset* valid_data,
const std::vector<const Metric*>& valid_metrics) override; const std::vector<const Metric*>& valid_metrics) override;
/*! /*!
* \brief Training logic * \brief Training logic
...@@ -51,6 +92,13 @@ public: ...@@ -51,6 +92,13 @@ public:
*/ */
virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override; virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override;
/*!
* \brief Rollback one iteration
*/
void RollbackOneIter() override;
/*! \brief Current iteration index, counting any loaded initial model's iterations */
int GetCurrentIteration() const override { return iter_ + num_init_iteration_; }
bool EvalAndCheckEarlyStopping() override; bool EvalAndCheckEarlyStopping() override;
/*! /*!
...@@ -73,40 +121,48 @@ public: ...@@ -73,40 +121,48 @@ public:
* \param result used to store prediction result, should allocate memory before call this function * \param result used to store prediction result, should allocate memory before call this function
* \param out_len lenght of returned score * \param out_len lenght of returned score
*/ */
void GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) const override; void GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) override;
/*! /*!
* \brief Predtion for one record without sigmoid transformation * \brief Prediction for one record without sigmoid transformation
* \param feature_values Feature value on this record * \param feature_values Feature value on this record
* \return Prediction result for this record * \return Prediction result for this record
*/ */
std::vector<double> PredictRaw(const double* feature_values) const override; std::vector<double> PredictRaw(const double* feature_values) const override;
/*! /*!
* \brief Predtion for one record with sigmoid transformation if enabled * \brief Prediction for one record with sigmoid transformation if enabled
* \param feature_values Feature value on this record * \param feature_values Feature value on this record
* \return Prediction result for this record * \return Prediction result for this record
*/ */
std::vector<double> Predict(const double* feature_values) const override; std::vector<double> Predict(const double* feature_values) const override;
/*! /*!
* \brief Predtion for one record with leaf index * \brief Prediction for one record with leaf index
* \param feature_values Feature value on this record * \param feature_values Feature value on this record
* \return Predicted leaf index for this record * \return Predicted leaf index for this record
*/ */
std::vector<int> PredictLeafIndex(const double* value) const override; std::vector<int> PredictLeafIndex(const double* value) const override;
/*! /*!
* \brief save model to file * \brief Dump model to json format string
* \param num_used_model number of model that want to save, -1 means save all * \return Json format string of model
* \param is_finish is training finished or not
* \param filename filename that want to save to
*/ */
virtual void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) override; std::string DumpModel() const override;
/*!
* \brief Save model to file
* \param num_used_model Number of model that want to save, -1 means save all
* \param is_finish Is training finished or not
* \param filename Filename that want to save to
*/
virtual void SaveModelToFile(int num_iterations, const char* filename) const override ;
/*! /*!
* \brief Restore from a serialized string * \brief Restore from a serialized string
*/ */
void LoadModelFromString(const std::string& model_str) override; void LoadModelFromString(const std::string& model_str) override;
/*! /*!
* \brief Get max feature index of this model * \brief Get max feature index of this model
* \return Max feature index of this model * \return Max feature index of this model
...@@ -119,11 +175,12 @@ public: ...@@ -119,11 +175,12 @@ public:
*/ */
inline int LabelIdx() const override { return label_idx_; } inline int LabelIdx() const override { return label_idx_; }
/*! /*!
* \brief Get number of weak sub-models * \brief Get number of weak sub-models
* \return Number of weak sub-models * \return Number of weak sub-models
*/ */
inline int NumberOfSubModels() const override { return static_cast<int>(models_.size()); } inline int NumberOfTotalModel() const override { return static_cast<int>(models_.size()); }
/*! /*!
* \brief Get number of classes * \brief Get number of classes
...@@ -132,12 +189,16 @@ public: ...@@ -132,12 +189,16 @@ public:
inline int NumberOfClasses() const override { return num_class_; } inline int NumberOfClasses() const override { return num_class_; }
/*! /*!
* \brief Set number of used model for prediction * \brief Set number of iterations for prediction
*/ */
inline void SetNumUsedModel(int num_used_model) { inline void SetNumIterationForPred(int num_iteration) override {
if (num_used_model >= 0) { if (num_iteration > 0) {
num_used_model_ = static_cast<int>(num_used_model / num_class_); num_iteration_for_pred_ = num_iteration;
} else {
num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
} }
num_iteration_for_pred_ = std::min(num_iteration_for_pred_,
static_cast<int>(models_.size()) / num_class_);
} }
/*! /*!
...@@ -178,7 +239,7 @@ protected: ...@@ -178,7 +239,7 @@ protected:
* \brief Calculate feature importances * \brief Calculate feature importances
* \param last_iter Last tree use to calculate * \param last_iter Last tree use to calculate
*/ */
std::string FeatureImportance() const; std::vector<std::pair<size_t, std::string>> FeatureImportance() const;
/*! \brief current iteration */ /*! \brief current iteration */
int iter_; int iter_;
/*! \brief Pointer to training data */ /*! \brief Pointer to training data */
...@@ -218,7 +279,7 @@ protected: ...@@ -218,7 +279,7 @@ protected:
std::vector<data_size_t> bag_data_indices_; std::vector<data_size_t> bag_data_indices_;
/*! \brief Number of in-bag data */ /*! \brief Number of in-bag data */
data_size_t bag_data_cnt_; data_size_t bag_data_cnt_;
/*! \brief Number of traning data */ /*! \brief Number of training data */
data_size_t num_data_; data_size_t num_data_;
/*! \brief Number of classes */ /*! \brief Number of classes */
int num_class_; int num_class_;
...@@ -226,19 +287,17 @@ protected: ...@@ -226,19 +287,17 @@ protected:
Random random_; Random random_;
/*! /*!
* \brief Sigmoid parameter, used for prediction. * \brief Sigmoid parameter, used for prediction.
* if > 0 meas output score will transform by sigmoid function * if > 0 means output score will transform by sigmoid function
*/ */
double sigmoid_; double sigmoid_;
/*! \brief Index of label column */ /*! \brief Index of label column */
data_size_t label_idx_; data_size_t label_idx_;
/*! \brief Saved number of models */
int saved_model_size_;
/*! \brief File to write models */
std::ofstream model_output_file_;
/*! \brief number of used model */ /*! \brief number of used model */
int num_used_model_; int num_iteration_for_pred_;
/*! \brief Shrinkage rate for one iteration */ /*! \brief Shrinkage rate for one iteration */
double shrinkage_rate_; double shrinkage_rate_;
/*! \brief Number of loaded initial models */
int num_init_iteration_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -279,7 +279,7 @@ inline VAL_T SparseBinIterator<VAL_T>::InnerGet(data_size_t idx) { ...@@ -279,7 +279,7 @@ inline VAL_T SparseBinIterator<VAL_T>::InnerGet(data_size_t idx) {
while (cur_pos_ < idx && i_delta_ < bin_data_->num_vals_) { while (cur_pos_ < idx && i_delta_ < bin_data_->num_vals_) {
bin_data_->NextNonzero(&i_delta_, &cur_pos_); bin_data_->NextNonzero(&i_delta_, &cur_pos_);
} }
if (cur_pos_ == idx && i_delta_ < bin_data_->num_vals_) { if (cur_pos_ == idx && i_delta_ < bin_data_->num_vals_ && i_delta_ >= 0) {
return bin_data_->vals_[i_delta_]; return bin_data_->vals_[i_delta_];
} else { } else {
return 0; return 0;
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -33,12 +33,9 @@ public: ...@@ -33,12 +33,9 @@ public:
~NDCGMetric() { ~NDCGMetric() {
} }
void Init(const char* test_name, const Metadata& metadata, data_size_t num_data) override { void Init(const Metadata& metadata, data_size_t num_data) override {
for (auto k : eval_at_) { for (auto k : eval_at_) {
std::stringstream str_buf; name_.emplace_back(std::string("ndcg@") + std::to_string(k));
str_buf << test_name << "'s : ";
str_buf << "NDCG@" + std::to_string(k) + " ";
name_.emplace_back(str_buf.str());
} }
num_data_ = num_data; num_data_ = num_data;
// get label // get label
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment