Commit 49d85642 authored by Guolin Ke's avatar Guolin Ke
Browse files

add sklearn like basic model

parent c861be93
...@@ -138,6 +138,8 @@ public: ...@@ -138,6 +138,8 @@ public:
bool is_unbalance = false; bool is_unbalance = false;
// for multiclass // for multiclass
int num_class = 1; int num_class = 1;
// Balancing of positive and negative weights
double scale_pos_weight = 1.0f;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -333,14 +335,18 @@ struct ParameterAlias { ...@@ -333,14 +335,18 @@ struct ParameterAlias {
{ "min_sum_hessian_per_leaf", "min_sum_hessian_in_leaf" }, { "min_sum_hessian_per_leaf", "min_sum_hessian_in_leaf" },
{ "min_sum_hessian", "min_sum_hessian_in_leaf" }, { "min_sum_hessian", "min_sum_hessian_in_leaf" },
{ "min_hessian", "min_sum_hessian_in_leaf" }, { "min_hessian", "min_sum_hessian_in_leaf" },
{ "min_child_weight", "min_sum_hessian_in_leaf" },
{ "num_leaf", "num_leaves" }, { "num_leaf", "num_leaves" },
{ "sub_feature", "feature_fraction" }, { "sub_feature", "feature_fraction" },
{ "colsample_bytree", "feature_fraction" },
{ "num_iteration", "num_iterations" }, { "num_iteration", "num_iterations" },
{ "num_tree", "num_iterations" }, { "num_tree", "num_iterations" },
{ "num_round", "num_iterations" }, { "num_round", "num_iterations" },
{ "num_trees", "num_iterations" }, { "num_trees", "num_iterations" },
{ "num_rounds", "num_iterations" }, { "num_rounds", "num_iterations" },
{ "sub_row", "bagging_fraction" }, { "sub_row", "bagging_fraction" },
{ "subsample", "bagging_fraction" },
{ "subsample_freq", "bagging_freq" },
{ "shrinkage_rate", "learning_rate" }, { "shrinkage_rate", "learning_rate" },
{ "tree", "tree_learner" }, { "tree", "tree_learner" },
{ "num_machine", "num_machines" }, { "num_machine", "num_machines" },
...@@ -363,6 +369,9 @@ struct ParameterAlias { ...@@ -363,6 +369,9 @@ struct ParameterAlias {
{ "blacklist", "ignore_column" }, { "blacklist", "ignore_column" },
{ "predict_raw_score", "is_predict_raw_score" }, { "predict_raw_score", "is_predict_raw_score" },
{ "predict_leaf_index", "is_predict_leaf_index" }, { "predict_leaf_index", "is_predict_leaf_index" },
{ "gamma", "min_gain_to_split" },
{ "reg_alpha", "lambda_l1" },
{ "reg_lambda", "lambda_l2" },
{ "num_classes", "num_class" } { "num_classes", "num_class" }
}); });
std::unordered_map<std::string, std::string> tmp_map; std::unordered_map<std::string, std::string> tmp_map;
......
...@@ -80,7 +80,7 @@ def record_evaluation(eval_result): ...@@ -80,7 +80,7 @@ def record_evaluation(eval_result):
def init(env): def init(env):
"""internal function""" """internal function"""
for data_name, eval_name, _ in env.evaluation_result_list: for data_name, eval_name, _, _ in env.evaluation_result_list:
if data_name not in eval_result: if data_name not in eval_result:
eval_result[data_name] = {} eval_result[data_name] = {}
if eval_name not in eval_result[data_name]: if eval_name not in eval_result[data_name]:
...@@ -90,7 +90,7 @@ def record_evaluation(eval_result): ...@@ -90,7 +90,7 @@ def record_evaluation(eval_result):
"""internal function""" """internal function"""
if len(eval_result) == 0: if len(eval_result) == 0:
init(env) init(env)
for data_name, eval_name, result in env.evaluation_result_list: for data_name, eval_name, result, _ in env.evaluation_result_list:
eval_result[data_name][eval_name].append(result) eval_result[data_name][eval_name].append(result)
return callback return callback
......
...@@ -41,7 +41,7 @@ def train(params, train_data, num_boost_round=100, ...@@ -41,7 +41,7 @@ def train(params, train_data, num_boost_round=100,
valid_datas=None, valid_names=None, valid_datas=None, valid_names=None,
fobj=None, feval=None, init_model=None, fobj=None, feval=None, init_model=None,
train_fields=None, valid_fields=None, train_fields=None, valid_fields=None,
early_stopping_rounds=None, out_eval_result=None, early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None): verbose_eval=True, learning_rates=None, callbacks=None):
"""Train with given parameters. """Train with given parameters.
...@@ -76,7 +76,7 @@ def train(params, train_data, num_boost_round=100, ...@@ -76,7 +76,7 @@ def train(params, train_data, num_boost_round=100,
If there's more than one, will check all of them If there's more than one, will check all of them
Returns the model with (best_iter + early_stopping_rounds) Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field If early stopping occurs, the model will add 'best_iteration' field
out_eval_result: dict or None evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_datas. This dictionary used to store all evaluation results of all the items in valid_datas.
Example: with a valid_datas containing [valid_set, train_set] and valid_names containing ['eval', 'train'] and Example: with a valid_datas containing [valid_set, train_set] and valid_names containing ['eval', 'train'] and
a paramater containing ('metric':'logloss') a paramater containing ('metric':'logloss')
...@@ -157,14 +157,20 @@ def train(params, train_data, num_boost_round=100, ...@@ -157,14 +157,20 @@ def train(params, train_data, num_boost_round=100,
if learning_rates is not None: if learning_rates is not None:
callbacks.append(callback.reset_learning_rate(learning_rates)) callbacks.append(callback.reset_learning_rate(learning_rates))
if out_eval_result is not None: if evals_result is not None:
callbacks.append(callback.record_evaluation(out_eval_result)) callbacks.append(callback.record_evaluation(evals_result))
callbacks_before_iter = [ callbacks_before_iter = [
cb for cb in callbacks if cb.__dict__.get('before_iteration', False)] cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
callbacks_after_iter = [ callbacks_after_iter = [
cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)] cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
"""construct booster""" """construct booster"""
if 'metric' in params:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
else:
params['metric'] = list(params['metric'])
booster = Booster(params=params, train_set=train_set) booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train: if is_valid_contain_train:
booster.set_train_data_name(train_data_name) booster.set_train_data_name(train_data_name)
......
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from .engine import train
# sklearn
try:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
SKLEARN_INSTALLED = True
LGBMModelBase = BaseEstimator
LGBMRegressorBase = RegressorMixin
LGBMClassifierBase = ClassifierMixin
LGBMLabelEncoder = LabelEncoder
except ImportError:
SKLEARN_INSTALLED = False
LGBMModelBase = object
LGBMClassifierBase = object
LGBMRegressorBase = object
LGBMLabelEncoder = None
def _objective_decorator(func):
"""Decorate an objective function
Converts an objective function using the typical sklearn metrics to LightGBM ffobj
Note: for multi-class task, the label/pred is group by class_id first, then group by row_id
if you want to get i-th row label/pred in j-th class, the access way is label/pred[j*num_data+i]
and you should group grad and hess in this way as well
Parameters
----------
func: callable
Expects a callable with signature ``func(y_true, y_pred)``:
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples]
The predicted values
Returns
-------
new_func: callable
The new objective function as expected by ``lightgbm.engine.train``.
The signature is ``new_func(preds, dataset)``:
preds: array_like, shape [n_samples]
The predicted values
dataset: ``dataset``
The training set from which the labels will be extracted using
``dataset.get_label()``
"""
def inner(preds, dataset):
"""internal function"""
labels = dataset.get_label()
return func(labels, preds)
return inner
class LGBMModel(LGBMModelBase):
    """Implementation of the Scikit-Learn API for LightGBM.
    Parameters
    ----------
    num_leaves : int
        Maximum tree leaves for base learners.
    max_depth : int
        Maximum tree depth for base learners, -1 means not limit.
    learning_rate : float
        Boosting learning rate
    n_estimators : int
        Number of boosted trees to fit.
    silent : boolean
        Whether to print messages while running boosting.
    objective : string or callable
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).
    num_class: int
        only affect for multi-class training.
    nthread : int
        Number of parallel threads
    gamma : float
        Minimum loss reduction required to make a further partition on a leaf node of the tree.
    min_child_weight : int
        Minimum sum of instance weight(hessian) needed in a child.
    subsample : float
        Subsample ratio of the training instance.
    subsample_freq : int
        frequence of subsample, <=0 means no enable
    colsample_bytree : float
        Subsample ratio of columns when constructing each tree.
    colsample_byleaf : float
        Subsample ratio of columns when constructing each leaf.
    reg_alpha : float
        L1 regularization term on weights
    reg_lambda : float
        L2 regularization term on weights
    scale_pos_weight : float
        Balancing of positive and negative weights.
    is_unbalance : bool
        Is unbalance for binary classification
    seed : int
        Random number seed.
    Note
    ----
    A custom objective function can be provided for the ``objective``
    parameter. In this case, it should have the signature
    ``objective(y_true, y_pred) -> grad, hess``:

    y_true: array_like of shape [n_samples]
        The target values
    y_pred: array_like of shape [n_samples]
        The predicted values
    grad: array_like of shape [n_samples]
        The value of the gradient for each sample point.
    hess: array_like of shape [n_samples]
        The value of the second derivative for each sample point

    for multi-class task, the label/pred is group by class_id first, then group by row_id
    if you want to get i-th row label/pred in j-th class, the access way is label/pred[j*num_data+i]
    and you should group grad and hess in this way as well
    """
    def __init__(self, num_leaves=63, max_depth=-1,
                 learning_rate=0.1, n_estimators=100, max_bin=255,
                 silent=True, objective="regression", num_class=1,
                 nthread=-1, gamma=0, min_child_weight=1,
                 subsample=1, subsample_freq=1, colsample_bytree=1, colsample_byleaf=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        if not SKLEARN_INSTALLED:
            raise LightGBMError('sklearn needs to be installed in order to use this module')
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.silent = silent
        self.objective = objective
        self.num_class = num_class
        self.nthread = nthread
        self.gamma = gamma
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.colsample_byleaf = colsample_byleaf
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        self._Booster = None
        # Evaluation results recorded by the most recent fit();
        # None until fit() has been called with a validation set,
        # so evals_result() can raise a clean LightGBMError instead
        # of an AttributeError.
        self.evals_result_ = None

    def booster(self):
        """Get the underlying lightgbm Booster of this model.
        This will raise an exception when fit was not called
        Returns
        -------
        booster : a lightgbm booster of underlying model
        """
        if self._Booster is None:
            raise LightGBMError('need to call fit beforehand')
        return self._Booster

    def get_params(self, deep=False):
        """Get parameters."""
        params = super(LGBMModel, self).get_params(deep=deep)
        # map the sklearn-style `silent` flag onto LightGBM's `verbose`
        params['verbose'] = 0 if self.silent else 1
        # nthread <= 0 means "let LightGBM decide"; drop it so the
        # library default is used
        if self.nthread <= 0:
            params.pop('nthread', None)
        return params

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True):
        """
        Fit the gradient boosting model

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use. See
            doc/parameter.md. If callable, a custom evaluation metric. The call
            signature is func(y_predicted, y_true) where y_true will be a
            Dataset object such that you may need to call the get_label
            method. And it must return (eval_name, feature_result, is_bigger_better)
        early_stopping_rounds : int
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
            metric measured on the validation set to stderr.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        evals_result = {}
        params = self.get_params()
        if callable(self.objective):
            # custom objective: wrap it into LightGBM's fobj form and
            # disable the built-in objective
            fobj = _objective_decorator(self.objective)
            params["objective"] = "None"
        else:
            fobj = None
        if callable(eval_metric):
            feval = eval_metric
        else:
            feval = None
            # only forward built-in metric names; leaving a `metric` of
            # None in params would break metric normalization downstream
            if eval_metric is not None:
                params['metric'] = eval_metric
        self._Booster = train(params, (X, y),
                              self.n_estimators, valid_datas=eval_set,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=fobj, feval=feval,
                              verbose_eval=verbose)
        # store whatever the record_evaluation callback collected
        # (empty dict when no eval_set was supplied)
        self.evals_result_ = evals_result
        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """Predict with the fitted booster; see Booster.predict."""
        return self.booster().predict(data,
                                      raw_score=raw_score,
                                      num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """Return the predicted leaf every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.
        num_iteration : int
            Limit number of trees in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster().predict(X,
                                      pred_leaf=True,
                                      num_iteration=num_iteration)

    def evals_result(self):
        """Return the evaluation results recorded by fit().

        Returns
        -------
        evals_result : dictionary

        Raises
        ------
        LightGBMError
            If fit() has not been called or produced no evaluation results.
        """
        if self.evals_result_:
            evals_result = self.evals_result_
        else:
            raise LightGBMError('No results.')
        return evals_result
\ No newline at end of file
...@@ -213,6 +213,7 @@ void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& pa ...@@ -213,6 +213,7 @@ void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& pa
CHECK(max_position > 0); CHECK(max_position > 0);
GetInt(params, "num_class", &num_class); GetInt(params, "num_class", &num_class);
CHECK(num_class >= 1); CHECK(num_class >= 1);
GetDouble(params, "scale_pos_weight", &scale_pos_weight);
std::string tmp_str = ""; std::string tmp_str = "";
if (GetString(params, "label_gain", &tmp_str)) { if (GetString(params, "label_gain", &tmp_str)) {
label_gain = Common::StringToDoubleArray(tmp_str, ','); label_gain = Common::StringToDoubleArray(tmp_str, ',');
......
...@@ -18,6 +18,7 @@ public: ...@@ -18,6 +18,7 @@ public:
if (sigmoid_ <= 0.0) { if (sigmoid_ <= 0.0) {
Log::Fatal("Sigmoid parameter %f should be greater than zero", sigmoid_); Log::Fatal("Sigmoid parameter %f should be greater than zero", sigmoid_);
} }
scale_pos_weight_ = static_cast<score_t>(config.scale_pos_weight);
} }
~BinaryLogloss() {} ~BinaryLogloss() {}
void Init(const Metadata& metadata, data_size_t num_data) override { void Init(const Metadata& metadata, data_size_t num_data) override {
...@@ -55,6 +56,7 @@ public: ...@@ -55,6 +56,7 @@ public:
label_weights_[0] = 1.0f; label_weights_[0] = 1.0f;
} }
} }
label_weights_[1] *= scale_pos_weight_;
} }
void GetGradients(const score_t* score, score_t* gradients, score_t* hessians) const override { void GetGradients(const score_t* score, score_t* gradients, score_t* hessians) const override {
...@@ -104,6 +106,7 @@ private: ...@@ -104,6 +106,7 @@ private:
score_t label_weights_[2]; score_t label_weights_[2];
/*! \brief Weights for data */ /*! \brief Weights for data */
const float* weights_; const float* weights_;
score_t scale_pos_weight_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment