tianlh / LightGBM-DCU · Commits

Commit 49d85642, authored Nov 29, 2016 by Guolin Ke
Parent: c861be93

    add sklearn like basic model

Showing 6 changed files with 295 additions and 6 deletions (+295, -6):
  include/LightGBM/config.h            +9    -0
  python-package/lightgbm/callback.py  +2    -2
  python-package/lightgbm/engine.py    +10   -4
  python-package/lightgbm/sklearn.py   +270  -0
  src/io/config.cpp                    +1    -0
  src/objective/binary_objective.hpp   +3    -0
include/LightGBM/config.h

@@ -138,6 +138,8 @@ public:
   bool is_unbalance = false;
   // for multiclass
   int num_class = 1;
+  // Balancing of positive and negative weights
+  double scale_pos_weight = 1.0f;
   void Set(const std::unordered_map<std::string, std::string>& params) override;
 };

@@ -333,14 +335,18 @@ struct ParameterAlias {
   { "min_sum_hessian_per_leaf", "min_sum_hessian_in_leaf" },
   { "min_sum_hessian", "min_sum_hessian_in_leaf" },
   { "min_hessian", "min_sum_hessian_in_leaf" },
+  { "min_child_weight", "min_sum_hessian_in_leaf" },
   { "num_leaf", "num_leaves" },
   { "sub_feature", "feature_fraction" },
+  { "colsample_bytree", "feature_fraction" },
   { "num_iteration", "num_iterations" },
   { "num_tree", "num_iterations" },
   { "num_round", "num_iterations" },
   { "num_trees", "num_iterations" },
   { "num_rounds", "num_iterations" },
   { "sub_row", "bagging_fraction" },
+  { "subsample", "bagging_fraction" },
+  { "subsample_freq", "bagging_freq" },
   { "shrinkage_rate", "learning_rate" },
   { "tree", "tree_learner" },
   { "num_machine", "num_machines" },

@@ -363,6 +369,9 @@ struct ParameterAlias {
   { "blacklist", "ignore_column" },
   { "predict_raw_score", "is_predict_raw_score" },
   { "predict_leaf_index", "is_predict_leaf_index" },
+  { "gamma", "min_gain_to_split" },
+  { "reg_alpha", "lambda_l1" },
+  { "reg_lambda", "lambda_l2" },
   { "num_classes", "num_class" }
  });
  std::unordered_map<std::string, std::string> tmp_map;
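These new aliases let sklearn/XGBoost-style parameter names stand in for LightGBM's native names at configuration time. A minimal sketch of the equivalence this enables (values are illustrative only):

    # Two parameter dicts that resolve to the same configuration once the
    # aliases above are applied (all values made up for illustration).
    sklearn_style = {'colsample_bytree': 0.8, 'subsample': 0.9, 'subsample_freq': 1,
                     'min_child_weight': 5, 'gamma': 0.0,
                     'reg_alpha': 0.1, 'reg_lambda': 1.0}
    native_style = {'feature_fraction': 0.8, 'bagging_fraction': 0.9, 'bagging_freq': 1,
                    'min_sum_hessian_in_leaf': 5, 'min_gain_to_split': 0.0,
                    'lambda_l1': 0.1, 'lambda_l2': 1.0}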
python-package/lightgbm/callback.py

@@ -80,7 +80,7 @@ def record_evaluation(eval_result):
     def init(env):
         """internal function"""
-        for data_name, eval_name, _ in env.evaluation_result_list:
+        for data_name, eval_name, _, _ in env.evaluation_result_list:
             if data_name not in eval_result:
                 eval_result[data_name] = {}
             if eval_name not in eval_result[data_name]:

@@ -90,7 +90,7 @@ def record_evaluation(eval_result):
         """internal function"""
         if len(eval_result) == 0:
             init(env)
-        for data_name, eval_name, result in env.evaluation_result_list:
+        for data_name, eval_name, result, _ in env.evaluation_result_list:
             eval_result[data_name][eval_name].append(result)
     return callback
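After this change, each entry in env.evaluation_result_list is a 4-tuple rather than a 3-tuple; the diff does not show what the fourth element carries (in later LightGBM versions it is the metric's is-higher-better flag). A sketch of a user callback unpacking the new shape, assuming env exposes the current round as env.iteration as the other callbacks in this module do:

    # Hypothetical user callback written against the new 4-tuple entries;
    # the fourth element is unpacked but left unused here.
    def print_evaluation(env):
        for data_name, eval_name, result, _ in env.evaluation_result_list:
            print('[%d] %s %s: %g' % (env.iteration, data_name, eval_name, result))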
python-package/lightgbm/engine.py

@@ -41,7 +41,7 @@ def train(params, train_data, num_boost_round=100,
           valid_datas=None, valid_names=None,
           fobj=None, feval=None, init_model=None,
           train_fields=None, valid_fields=None,
-          early_stopping_rounds=None, out_eval_result=None,
+          early_stopping_rounds=None, evals_result=None,
           verbose_eval=True, learning_rates=None, callbacks=None):
     """Train with given parameters.

@@ -76,7 +76,7 @@ def train(params, train_data, num_boost_round=100,
         If there's more than one, will check all of them
         Returns the model with (best_iter + early_stopping_rounds)
         If early stopping occurs, the model will add 'best_iteration' field
-    out_eval_result: dict or None
+    evals_result: dict or None
         This dictionary is used to store all evaluation results of all the items in valid_datas.
         Example: with a valid_datas containing [valid_set, train_set] and valid_names containing ['eval', 'train'] and
         a parameter containing ('metric':'logloss')

@@ -157,14 +157,20 @@ def train(params, train_data, num_boost_round=100,
     if learning_rates is not None:
         callbacks.append(callback.reset_learning_rate(learning_rates))
-    if out_eval_result is not None:
-        callbacks.append(callback.record_evaluation(out_eval_result))
+    if evals_result is not None:
+        callbacks.append(callback.record_evaluation(evals_result))
     callbacks_before_iter = [cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
     callbacks_after_iter = [cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]

     """construct booster"""
+    if 'metric' in params:
+        if is_str(params['metric']):
+            params['metric'] = params['metric'].split(',')
+        else:
+            params['metric'] = list(params['metric'])
     booster = Booster(params=params, train_set=train_set)
     if is_valid_contain_train:
         booster.set_train_data_name(train_data_name)
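A sketch of a call against the renamed keyword; train_set and valid_set stand in for Dataset objects built elsewhere, and the metric string exercises the new comma-splitting:

    # Hypothetical usage of the renamed evals_result keyword.
    evals_result = {}
    booster = train({'objective': 'binary', 'metric': 'binary_logloss,auc'},
                    train_set,
                    valid_datas=[valid_set], valid_names=['eval'],
                    evals_result=evals_result)
    # record_evaluation fills the dict during training, roughly:
    # {'eval': {'binary_logloss': [...], 'auc': [...]}}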
python-package/lightgbm/sklearn.py  (new file, mode 100644)

"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import

import numpy as np

from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from .engine import train

# sklearn
try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
    SKLEARN_INSTALLED = True
    LGBMModelBase = BaseEstimator
    LGBMRegressorBase = RegressorMixin
    LGBMClassifierBase = ClassifierMixin
    LGBMLabelEncoder = LabelEncoder
except ImportError:
    SKLEARN_INSTALLED = False
    LGBMModelBase = object
    LGBMClassifierBase = object
    LGBMRegressorBase = object
    LGBMLabelEncoder = None


def _objective_decorator(func):
    """Decorate an objective function.

    Converts an objective function using the typical sklearn signature to a LightGBM fobj.

    Note: for the multi-class task, the label/pred is grouped by class_id first, then by row_id.
    If you want the i-th row label/pred in the j-th class, access it as label/pred[j * num_data + i],
    and you should group grad and hess in this way as well.

    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, y_pred)``:

        y_true: array_like of shape [n_samples]
            The target values
        y_pred: array_like of shape [n_samples]
            The predicted values

    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """internal function"""
        labels = dataset.get_label()
        return func(labels, preds)
    return inner


class LGBMModel(LGBMModelBase):
    """Implementation of the Scikit-Learn API for LightGBM.

    Parameters
    ----------
    num_leaves : int
        Maximum tree leaves for base learners.
    max_depth : int
        Maximum tree depth for base learners, -1 means no limit.
    learning_rate : float
        Boosting learning rate.
    n_estimators : int
        Number of boosted trees to fit.
    silent : boolean
        Whether to print messages while running boosting.
    objective : string or callable
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).
    num_class : int
        Only used for multi-class training.
    nthread : int
        Number of parallel threads.
    gamma : float
        Minimum loss reduction required to make a further partition on a leaf node of the tree.
    min_child_weight : int
        Minimum sum of instance weight (hessian) needed in a child.
    subsample : float
        Subsample ratio of the training instances.
    subsample_freq : int
        Frequency of subsampling; <= 0 means disabled.
    colsample_bytree : float
        Subsample ratio of columns when constructing each tree.
    colsample_byleaf : float
        Subsample ratio of columns when constructing each leaf.
    reg_alpha : float
        L1 regularization term on weights.
    reg_lambda : float
        L2 regularization term on weights.
    scale_pos_weight : float
        Balancing of positive and negative weights.
    is_unbalance : bool
        Whether the training data is unbalanced, for binary classification.
    seed : int
        Random number seed.

    Note
    ----
    A custom objective function can be provided for the ``objective``
    parameter. In this case, it should have the signature
    ``objective(y_true, y_pred) -> grad, hess``:

    y_true: array_like of shape [n_samples]
        The target values
    y_pred: array_like of shape [n_samples]
        The predicted values
    grad: array_like of shape [n_samples]
        The value of the gradient for each sample point.
    hess: array_like of shape [n_samples]
        The value of the second derivative for each sample point.

    For the multi-class task, the label/pred is grouped by class_id first, then by row_id.
    If you want the i-th row label/pred in the j-th class, access it as label/pred[j * num_data + i],
    and you should group grad and hess in this way as well.
    """

    def __init__(self, num_leaves=63, max_depth=-1,
                 learning_rate=0.1, n_estimators=100, max_bin=255,
                 silent=True, objective="regression", num_class=1,
                 nthread=-1, gamma=0, min_child_weight=1,
                 subsample=1, subsample_freq=1,
                 colsample_bytree=1, colsample_byleaf=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        if not SKLEARN_INSTALLED:
            raise LightGBMError('sklearn needs to be installed in order to use this module')

        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.silent = silent
        self.objective = objective
        self.num_class = num_class
        self.nthread = nthread
        self.gamma = gamma
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.colsample_byleaf = colsample_byleaf
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        self._Booster = None

    def booster(self):
        """Get the underlying lightgbm Booster of this model.

        This will raise an exception when fit was not called.

        Returns
        -------
        booster : a lightgbm booster of underlying model
        """
        if self._Booster is None:
            raise LightGBMError('need to call fit beforehand')
        return self._Booster

    def get_params(self, deep=False):
        """Get parameters."""
        params = super(LGBMModel, self).get_params(deep=deep)
        params['verbose'] = 0 if self.silent else 1
        if self.nthread <= 0:
            params.pop('nthread', None)
        return params

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use. See
            doc/parameter.md. If callable, a custom evaluation metric. The call
            signature is func(y_predicted, y_true) where y_true will be a
            Dataset object, so you may need to call the get_label
            method. It must return (eval_name, eval_result, is_bigger_better).
        early_stopping_rounds : int
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
            metric measured on the validation set to stderr.
        """
        evals_result = {}
        params = self.get_params()

        if callable(self.objective):
            fobj = _objective_decorator(self.objective)
            params["objective"] = "None"
        else:
            fobj = None
        if callable(eval_metric):
            feval = eval_metric
        else:
            feval = None
            if eval_metric is not None:
                params.update({'metric': eval_metric})

        self._Booster = train(params, (X, y), self.n_estimators,
                              valid_datas=eval_set,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=fobj, feval=feval,
                              verbose_eval=verbose)

        if evals_result:
            for val in evals_result.items():
                evals_result_key = list(val[1].keys())[0]
                evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
            self.evals_result_ = evals_result

        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        return self.booster().predict(data,
                                      raw_score=raw_score,
                                      num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """Return the predicted leaf of every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.
        num_iteration : int
            Limit number of trees in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster().predict(X,
                                      pred_leaf=True,
                                      num_iteration=num_iteration)

    def evals_result(self):
        """Return the evaluation results.

        Returns
        -------
        evals_result : dictionary
        """
        if self.evals_result_:
            evals_result = self.evals_result_
        else:
            raise LightGBMError('No results.')
        return evals_result
\ No newline at end of file
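A sketch of end-to-end usage of the new wrapper on synthetic data, exercising both the built-in-metric path and the custom-objective path through _objective_decorator; all data and values here are made up, and the import path assumes the package is installed as lightgbm:

    import numpy as np
    from lightgbm.sklearn import LGBMModel

    def squared_loss(y_true, y_pred):
        # sklearn-style objective; the decorator adapts it to LightGBM's fobj
        grad = y_pred - y_true           # d/dpred of 0.5 * (pred - true)^2
        hess = np.ones_like(y_pred)      # second derivative is constant
        return grad, hess

    X = np.random.rand(200, 5)
    y = 3.0 * X[:, 0] + np.random.rand(200)

    # built-in objective plus a built-in metric, with early stopping
    model = LGBMModel(n_estimators=20, objective="regression")
    model.fit(X, y, eval_set=[(X, y)], eval_metric='l2', early_stopping_rounds=5)
    print(model.evals_result())
    print(model.predict(X)[:3])

    # custom objective: params["objective"] becomes "None" and fobj is used instead
    model2 = LGBMModel(n_estimators=20, objective=squared_loss).fit(X, y)
    leaf_indices = model2.apply(X)       # shape [n_samples, n_trees]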
src/io/config.cpp

@@ -213,6 +213,7 @@ void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& pa
   CHECK(max_position > 0);
   GetInt(params, "num_class", &num_class);
   CHECK(num_class >= 1);
+  GetDouble(params, "scale_pos_weight", &scale_pos_weight);
   std::string tmp_str = "";
   if (GetString(params, "label_gain", &tmp_str)) {
     label_gain = Common::StringToDoubleArray(tmp_str, ',');
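With this line, ObjectiveConfig picks scale_pos_weight out of the parameter map like the neighboring keys, so the option can be supplied from user parameters. A hypothetical parameter dict that would now take effect (value made up):

    # Illustrative only: the new GetDouble call makes this key reach the
    # binary objective instead of being silently ignored.
    params = {'objective': 'binary', 'scale_pos_weight': 3.5}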
src/objective/binary_objective.hpp

@@ -18,6 +18,7 @@ public:
     if (sigmoid_ <= 0.0) {
       Log::Fatal("Sigmoid parameter %f should be greater than zero", sigmoid_);
     }
+    scale_pos_weight_ = static_cast<score_t>(config.scale_pos_weight);
   }
   ~BinaryLogloss() {}
   void Init(const Metadata& metadata, data_size_t num_data) override {

@@ -55,6 +56,7 @@ public:
         label_weights_[0] = 1.0f;
       }
     }
+    label_weights_[1] *= scale_pos_weight_;
   }
   void GetGradients(const score_t* score, score_t* gradients, score_t* hessians) const override {

@@ -104,6 +106,7 @@ private:
   score_t label_weights_[2];
   /*! \brief Weights for data */
   const float* weights_;
+  score_t scale_pos_weight_;
 };
 }  // namespace LightGBM
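The net effect: label_weights_[1], the per-class weight applied to positive examples, is multiplied by scale_pos_weight, so each positive example contributes proportionally more to the loss and its gradients. A numeric sketch of that weighting in Python, with names chosen to mirror the C++ members (the exact gradient formulas in GetGradients are not shown in these hunks):

    import numpy as np

    def weighted_logloss(y, p, scale_pos_weight=1.0):
        # mirrors label_weights_ = {1.0, 1.0}; label_weights_[1] *= scale_pos_weight_
        label_weights = np.array([1.0, 1.0 * scale_pos_weight])
        w = label_weights[y]          # per-example weight selected by class label
        return np.mean(-w * (y * np.log(p) + (1 - y) * np.log(1 - p)))

    y = np.array([0, 0, 0, 1])        # imbalanced toy labels
    p = np.array([0.2, 0.1, 0.3, 0.6])
    print(weighted_logloss(y, p, scale_pos_weight=3.0))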