Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
ebfc8521
Commit
ebfc8521
authored
Dec 09, 2016
by
wxchan
Committed by
Guolin Ke
Dec 09, 2016
Browse files
add an advanced example; add guide-python README.md details; clean error messages (#117)
parent
b51c7be4
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
273 additions
and
130 deletions
+273
-130
examples/python-guide/README.md
examples/python-guide/README.md
+20
-0
examples/python-guide/advanced_example.py
examples/python-guide/advanced_example.py
+131
-0
examples/python-guide/simple_example.py
examples/python-guide/simple_example.py
+10
-5
examples/python-guide/sklearn_example.py
examples/python-guide/sklearn_example.py
+8
-3
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+59
-77
python-package/lightgbm/callback.py
python-package/lightgbm/callback.py
+4
-4
python-package/lightgbm/engine.py
python-package/lightgbm/engine.py
+11
-11
python-package/lightgbm/sklearn.py
python-package/lightgbm/sklearn.py
+30
-30
No files found.
examples/python-guide/README.md
View file @
ebfc8521
...
...
@@ -16,3 +16,23 @@ Now you can run examples in this folder, for example:
```
python simple_example.py
```
Examples including:
-
[
simple_example.py
](
https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
)
-
Construct Dataset
-
Basic train and predict
-
Eval during training
-
Early stopping
-
Save model to file
-
Dump model to json format
-
Feature importances
-
[
sklearn_example.py
](
https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py
)
-
Basic train and predict with sklearn interface
-
Feature importances with sklearn interface
-
[
advanced_example.py
](
https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py
)
-
Set feature names
-
Directly use categorical features without one-hot encoding
-
Load model file to continue training
-
Change learning rates during training
-
Self-defined objective function
-
Self-defined eval metric
-
Callback function
\ No newline at end of file
examples/python-guide/advanced_example.py
0 → 100644
View file @
ebfc8521
# coding: utf-8
# pylint: disable = invalid-name, C0111
import
lightgbm
as
lgb
import
pandas
as
pd
import
numpy
as
np
# load or create your dataset
# The train/test files are read as tab-separated text without a header row,
# so pandas labels the columns 0..N-1.
print('Load data...')
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
# The weight files are read without a header row; only their first column is used.
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]

# Column 0 is used as the label; every remaining column is a feature.
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# num_feature drives the generated feature names below; num_train is not
# used again in this script.
num_train, num_feature = X_train.shape

# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
# (lgb_train is passed to several lgb.train calls below)
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# generate a feature name for every column: 'feature_0', 'feature_1', ...
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Start training...')
# First training job: pass feature_name and categorical_feature to lgb.train.
# categorical_feature=[21] refers to a feature by integer index.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # eval training data
                feature_name=feature_name,
                categorical_feature=[21])

# check feature name: index 6 is the 7th feature
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))

# save model to file (re-used as init_model by the next call)
gbm.save_model('model.txt')

# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
# Here the model file name form is used.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',
                valid_sets=lgb_eval)

print('Finish 10 - 20 rounds with model file...')

# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
# 3. function(curr_iter, total_iter)
# Here the one-argument function form is used, and init_model is the Booster
# returned by the previous call.
# NOTE(review): the lambda parameter is named `iter`, which shadows the
# builtin of the same name inside the lambda body.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)

print('Finish 20 - 30 rounds with decay learning rates...')
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelood(preds, train_data):
    """Custom objective: gradient and hessian of the log likelihood loss.

    Parameters
    ----------
    preds : array
        Raw scores; they are passed through the logistic function here.
    train_data : Dataset
        Object whose ``get_label()`` returns the target values.

    Returns
    -------
    grad, hess : tuple
        ``p - labels`` and ``p * (1 - p)``, where ``p`` is the logistic
        transform of ``preds``.
    """
    targets = train_data.get_label()
    # Logistic transform of the raw scores.
    prob = 1. / (1. + np.exp(-preds))
    return prob - targets, prob * (1. - prob)
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: float, is_higher_better: bool
# binary error
def binary_error(preds, train_data):
    """Custom eval metric: fraction of predictions on the wrong side of 0.5.

    Parameters
    ----------
    preds : array
        Predicted values; anything above 0.5 counts as a positive prediction.
    train_data : Dataset
        Object whose ``get_label()`` returns the target values.

    Returns
    -------
    tuple
        ``('error', value, False)``: the metric name, the mean of the
        label/prediction mismatches, and ``False`` for is_higher_better.
    """
    truth = train_data.get_label()
    predicted_positive = preds > 0.5
    error_rate = np.mean(truth != predicted_positive)
    return 'error', error_rate, False
# Continue from the previous Booster, this time passing the self-defined
# objective as fobj and the self-defined eval metric as feval.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelood,
                feval=binary_error,
                valid_sets=lgb_eval)

print('Finish 30 - 40 rounds with self-defined objective function and eval metric...')

print('Start a new training job...')
# callback
def reset_metrics():
    """Build a training callback that adds a validation set mid-training.

    Returns
    -------
    callback : callable
        Function taking the callback environment ``env``. When
        ``env.iteration - env.begin_iteration == 5`` it builds a Dataset from
        the module-level ``X_test`` / ``y_test`` (with ``lgb_train`` as
        reference) and registers it on ``env.model`` under the name
        'new valid'. On every other iteration it does nothing.
    """
    def callback(env):
        if env.iteration - env.begin_iteration == 5:
            # Build the Dataset only on the iteration that uses it, rather
            # than creating and discarding one on every call.
            lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(lgb_eval_new, 'new valid')
    # Attributes read by the training loop.
    # NOTE(review): presumably before_iteration=True makes lgb.train run the
    # callback before each boosting iteration and order sets its position
    # among other callbacks -- confirm against lightgbm's engine/callback code.
    callback.before_iteration = True
    callback.order = 0
    return callback
# New training job (no init_model) that evaluates on the training data and
# passes the callback built by reset_metrics() through `callbacks`.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                callbacks=[reset_metrics()])

print('Finish first 10 rounds with callback function...')
examples/python-guide/simple_example.py
View file @
ebfc8521
...
...
@@ -6,6 +6,7 @@ import pandas as pd
from
sklearn.metrics
import
mean_squared_error
# load or create your dataset
print
(
'Load data...'
)
df_train
=
pd
.
read_csv
(
'../regression/regression.train'
,
header
=
None
,
sep
=
'
\t
'
)
df_test
=
pd
.
read_csv
(
'../regression/regression.test'
,
header
=
None
,
sep
=
'
\t
'
)
...
...
@@ -18,7 +19,6 @@ X_test = df_test.drop(0, axis=1)
lgb_train
=
lgb
.
Dataset
(
X_train
,
y_train
)
lgb_eval
=
lgb
.
Dataset
(
X_test
,
y_test
,
reference
=
lgb_train
)
# specify your configurations as a dict
params
=
{
'task'
:
'train'
,
...
...
@@ -33,27 +33,32 @@ params = {
'verbose'
:
0
}
print
(
'Start training...'
)
# train
gbm
=
lgb
.
train
(
params
,
lgb_train
,
num_boost_round
=
10
0
,
num_boost_round
=
2
0
,
valid_sets
=
lgb_eval
,
early_stopping_rounds
=
10
)
early_stopping_rounds
=
5
)
print
(
'Save model...'
)
# save model to file
gbm
.
save_model
(
'model.txt'
)
print
(
'Start predicting...'
)
# predict
y_pred
=
gbm
.
predict
(
X_test
,
num_iteration
=
gbm
.
best_iteration
)
# eval
print
(
'The rmse of prediction is:'
,
mean_squared_error
(
y_test
,
y_pred
)
**
0.5
)
print
(
'Dump model to JSON...'
)
# dump model to json (and save to file)
model_json
=
gbm
.
dump_model
()
with
open
(
'model.json'
,
'w+'
)
as
f
:
json
.
dump
(
model_json
,
f
,
indent
=
4
)
print
(
'Calculate feature importances...'
)
# feature importances
print
(
'Feature importances:'
,
gbm
.
feature_importance
())
print
(
'Feature importances:'
,
gbm
.
feature_importance
(
"gain"
))
print
(
'Feature importances:'
,
list
(
gbm
.
feature_importance
())
)
#
print('Feature importances:',
list(
gbm.feature_importance("gain"))
)
examples/python-guide/sklearn_example.py
View file @
ebfc8521
...
...
@@ -5,6 +5,7 @@ import pandas as pd
from
sklearn.metrics
import
mean_squared_error
# load or create your dataset
print
(
'Load data...'
)
df_train
=
pd
.
read_csv
(
'../regression/regression.train'
,
header
=
None
,
sep
=
'
\t
'
)
df_test
=
pd
.
read_csv
(
'../regression/regression.test'
,
header
=
None
,
sep
=
'
\t
'
)
...
...
@@ -13,19 +14,23 @@ y_test = df_test[0]
X_train
=
df_train
.
drop
(
0
,
axis
=
1
)
X_test
=
df_test
.
drop
(
0
,
axis
=
1
)
print
(
'Start training...'
)
# train
gbm
=
lgb
.
LGBMRegressor
(
objective
=
'regression'
,
num_leaves
=
31
,
learning_rate
=
0.05
,
n_estimators
=
10
0
)
n_estimators
=
2
0
)
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)],
early_stopping_rounds
=
10
)
eval_metric
=
'l1'
,
early_stopping_rounds
=
5
)
print
(
'Start predicting...'
)
# predict
y_pred
=
gbm
.
predict
(
X_test
,
num_iteration
=
gbm
.
best_iteration
)
# eval
print
(
'The rmse of prediction is:'
,
mean_squared_error
(
y_test
,
y_pred
)
**
0.5
)
print
(
'Calculate feature importances...'
)
# feature importances
print
(
'Feature importances:'
,
gbm
.
feature_importance
())
print
(
'Feature importances:'
,
list
(
gbm
.
feature_importance
())
)
python-package/lightgbm/basic.py
View file @
ebfc8521
# coding: utf-8
# pylint: disable = invalid-name, C0111, C0301, R0912, R0913, R0914, W0105
# pylint: disable = invalid-name, C0111, C0301
# pylint: disable = R0912, R0913, R0914, W0105, W0201, W0212
# pylint: disable = E1101
"""Wrapper c_api of LightGBM"""
from
__future__
import
absolute_import
...
...
@@ -17,13 +18,11 @@ from .libpath import find_lib_path
"""pandas"""
try
:
from
pandas
import
Series
,
DataFrame
IS_PANDAS_INSTALLED
=
True
except
ImportError
:
class
Series
(
object
):
pass
class
DataFrame
(
object
):
pass
IS_PANDAS_INSTALLED
=
False
IS_PY3
=
(
sys
.
version_info
[
0
]
==
3
)
...
...
@@ -72,7 +71,7 @@ def is_1d_list(data):
return
isinstance
(
data
,
list
)
and
\
(
not
data
or
isinstance
(
data
[
0
],
(
int
,
float
,
bool
)))
def
list_to_1d_numpy
(
data
,
dtype
):
def
list_to_1d_numpy
(
data
,
dtype
=
np
.
float32
,
name
=
'list'
):
"""convert to 1d numpy array"""
if
is_numpy_1d_array
(
data
):
if
data
.
dtype
==
dtype
:
...
...
@@ -81,28 +80,26 @@ def list_to_1d_numpy(data, dtype):
return
data
.
astype
(
dtype
=
dtype
,
copy
=
False
)
elif
is_1d_list
(
data
):
return
np
.
array
(
data
,
dtype
=
dtype
,
copy
=
False
)
elif
IS_PANDAS_INSTALLED
and
isinstance
(
data
,
Series
):
return
data
.
astype
(
dtype
)
.
values
elif
isinstance
(
data
,
Series
):
return
data
.
values
.
astype
(
dtype
)
else
:
raise
TypeError
(
"
Unknow
type({})"
.
format
(
type
(
data
).
__name__
))
raise
TypeError
(
"
Wrong
type({})
for {}, should be list or numpy array
"
.
format
(
type
(
data
).
__name__
,
name
))
def
cfloat32_array_to_numpy
(
cptr
,
length
):
"""Convert a ctypes float pointer array to a numpy array.
"""
if
isinstance
(
cptr
,
ctypes
.
POINTER
(
ctypes
.
c_float
)):
res
=
np
.
fromiter
(
cptr
,
dtype
=
np
.
float32
,
count
=
length
)
return
res
return
np
.
fromiter
(
cptr
,
dtype
=
np
.
float32
,
count
=
length
)
else
:
raise
RuntimeError
(
'
e
xpected float pointer'
)
raise
RuntimeError
(
'
E
xpected float pointer'
)
def
cint32_array_to_numpy
(
cptr
,
length
):
"""Convert a ctypes float pointer array to a numpy array.
"""
if
isinstance
(
cptr
,
ctypes
.
POINTER
(
ctypes
.
c_int32
)):
res
=
np
.
fromiter
(
cptr
,
dtype
=
np
.
int32
,
count
=
length
)
return
res
return
np
.
fromiter
(
cptr
,
dtype
=
np
.
int32
,
count
=
length
)
else
:
raise
RuntimeError
(
'
e
xpected int pointer'
)
raise
RuntimeError
(
'
E
xpected int pointer'
)
def
c_str
(
string
):
"""Convert a python string to cstring."""
...
...
@@ -113,7 +110,7 @@ def c_array(ctype, values):
return
(
ctype
*
len
(
values
))(
*
values
)
def
param_dict_to_str
(
data
):
if
not
data
:
if
data
is
None
or
not
data
:
return
""
pairs
=
[]
for
key
,
val
in
data
.
items
():
...
...
@@ -122,7 +119,7 @@ def param_dict_to_str(data):
elif
isinstance
(
val
,
(
list
,
tuple
,
set
)):
pairs
.
append
(
str
(
key
)
+
'='
+
','
.
join
(
map
(
str
,
val
)))
else
:
raise
TypeError
(
'
u
nknow type of parameter:%s
, got:%s'
raise
TypeError
(
'
U
nknow
n
type of parameter:%s, got:%s'
%
(
key
,
type
(
val
).
__name__
))
return
' '
.
join
(
pairs
)
...
...
@@ -158,10 +155,10 @@ def c_float_array(data):
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_double
))
type_data
=
C_API_DTYPE_FLOAT64
else
:
raise
TypeError
(
"
e
xpected np.float32 or np.float64, met type({})"
raise
TypeError
(
"
E
xpected np.float32 or np.float64, met type({})"
.
format
(
data
.
dtype
))
else
:
raise
TypeError
(
"Unknow type({})"
.
format
(
type
(
data
).
__name__
))
raise
TypeError
(
"Unknow
n
type({})"
.
format
(
type
(
data
).
__name__
))
return
(
ptr_data
,
type_data
)
def
c_int_array
(
data
):
...
...
@@ -176,10 +173,10 @@ def c_int_array(data):
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_int64
))
type_data
=
C_API_DTYPE_INT64
else
:
raise
TypeError
(
"
e
xpected np.int32 or np.int64, met type({})"
raise
TypeError
(
"
E
xpected np.int32 or np.int64, met type({})"
.
format
(
data
.
dtype
))
else
:
raise
TypeError
(
"Unknow type({})"
.
format
(
type
(
data
).
__name__
))
raise
TypeError
(
"Unknow
n
type({})"
.
format
(
type
(
data
).
__name__
))
return
(
ptr_data
,
type_data
)
class
_InnerPredictor
(
object
):
...
...
@@ -261,7 +258,7 @@ class _InnerPredictor(object):
Prediction result
"""
if
isinstance
(
data
,
(
_InnerDataset
,
Dataset
)):
raise
TypeError
(
"
c
annot use Dataset instance for prediction, please use raw data instead"
)
raise
TypeError
(
"
C
annot use Dataset instance for prediction, please use raw data instead"
)
predict_type
=
C_API_PREDICT_NORMAL
if
raw_score
:
predict_type
=
C_API_PREDICT_RAW_SCORE
...
...
@@ -290,7 +287,7 @@ class _InnerPredictor(object):
elif
isinstance
(
data
,
np
.
ndarray
):
preds
,
nrow
=
self
.
__pred_for_np2d
(
data
,
num_iteration
,
predict_type
)
elif
IS_PANDAS_INSTALLED
and
isinstance
(
data
,
DataFrame
):
elif
isinstance
(
data
,
DataFrame
):
preds
,
nrow
=
self
.
__pred_for_np2d
(
data
.
values
,
num_iteration
,
predict_type
)
else
:
...
...
@@ -299,15 +296,14 @@ class _InnerPredictor(object):
preds
,
nrow
=
self
.
__pred_for_csr
(
csr
,
num_iteration
,
predict_type
)
except
:
raise
TypeError
(
'can not predict data for type {}'
.
format
(
type
(
data
).
__name__
))
raise
TypeError
(
'Cannot predict data for type {}'
.
format
(
type
(
data
).
__name__
))
if
pred_leaf
:
preds
=
preds
.
astype
(
np
.
int32
)
if
is_reshape
and
preds
.
size
!=
nrow
:
if
preds
.
size
%
nrow
==
0
:
preds
=
preds
.
reshape
(
nrow
,
-
1
)
else
:
raise
ValueError
(
'
l
ength of predict result (%d) cannot be divide nrow (%d)'
raise
ValueError
(
'
L
ength of predict result (%d) cannot be divide nrow (%d)'
%
(
preds
.
size
,
nrow
))
return
preds
...
...
@@ -353,7 +349,7 @@ class _InnerPredictor(object):
preds
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))
))
if
n_preds
!=
out_num_preds
.
value
:
raise
ValueError
(
"
incorrect number
for predict result"
)
raise
ValueError
(
"
Wrong length
for predict result
s
"
)
return
preds
,
mat
.
shape
[
0
]
def
__pred_for_csr
(
self
,
csr
,
num_iteration
,
predict_type
):
...
...
@@ -384,7 +380,7 @@ class _InnerPredictor(object):
preds
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))
))
if
n_preds
!=
out_num_preds
.
value
:
raise
ValueError
(
"
incorrect number
for predict result"
)
raise
ValueError
(
"
Wrong length
for predict result
s
"
)
return
preds
,
nrow
PANDAS_DTYPE_MAPPER
=
{
'int8'
:
'int'
,
'int16'
:
'int'
,
'int32'
:
'int'
,
...
...
@@ -481,10 +477,10 @@ class _InnerDataset(object):
elif
isinstance
(
name
,
int
):
categorical_indices
.
add
(
name
)
else
:
raise
TypeError
(
"
unknown
type({}) or unknown name({}) in categorical_feature"
\
raise
TypeError
(
"
Wrong
type({}) or unknown name({}) in categorical_feature"
\
.
format
(
type
(
name
).
__name__
,
name
))
params
[
'categorical_column'
]
=
categorical_indices
params
[
'categorical_column'
]
=
sorted
(
categorical_indices
)
params_str
=
param_dict_to_str
(
params
)
"""process for reference dataset"""
...
...
@@ -514,11 +510,11 @@ class _InnerDataset(object):
csr
=
scipy
.
sparse
.
csr_matrix
(
data
)
self
.
__init_from_csr
(
csr
,
params_str
,
ref_dataset
)
except
:
raise
TypeError
(
'
c
an
not initialize _InnerDataset from {}'
.
format
(
type
(
data
).
__name__
))
raise
TypeError
(
'
C
annot initialize _InnerDataset from {}'
.
format
(
type
(
data
).
__name__
))
if
label
is
not
None
:
self
.
set_label
(
label
)
if
self
.
get_label
()
is
None
:
raise
ValueError
(
"
l
abel should not be None"
)
raise
ValueError
(
"
L
abel should not be None"
)
if
weight
is
not
None
:
self
.
set_weight
(
weight
)
if
group
is
not
None
:
...
...
@@ -572,7 +568,7 @@ class _InnerDataset(object):
"""
Get subset of current dataset
"""
used_indices
=
list_to_1d_numpy
(
used_indices
,
np
.
int32
)
used_indices
=
list_to_1d_numpy
(
used_indices
,
np
.
int32
,
name
=
'used_indices'
)
ret
=
_InnerDataset
(
None
)
ret
.
handle
=
ctypes
.
c_void_p
()
params_str
=
param_dict_to_str
(
params
)
...
...
@@ -585,7 +581,7 @@ class _InnerDataset(object):
ret
.
max_bin
=
self
.
max_bin
ret
.
predictor
=
self
.
predictor
if
ret
.
get_label
()
is
None
:
raise
ValueError
(
"
l
abel should not be None"
)
raise
ValueError
(
"
L
abel should not be None"
)
return
ret
def
set_feature_name
(
self
,
feature_name
):
...
...
@@ -595,7 +591,7 @@ class _InnerDataset(object):
if
feature_name
is
None
:
return
if
len
(
feature_name
)
!=
self
.
num_feature
():
raise
ValueError
(
"
size
of feature_name
error"
)
raise
ValueError
(
"
Length
of feature_name
({}) and num_feature({}) don't match"
.
format
(
len
(
feature_name
),
self
.
num_feature
())
)
c_feature_name
=
[
c_str
(
name
)
for
name
in
feature_name
]
_safe_call
(
_LIB
.
LGBM_DatasetSetFeatureNames
(
self
.
handle
,
...
...
@@ -632,7 +628,7 @@ class _InnerDataset(object):
Initialize data from a CSR matrix.
"""
if
len
(
csr
.
indices
)
!=
len
(
csr
.
data
):
raise
ValueError
(
'
l
ength mismatch: {} vs {}'
.
format
(
len
(
csr
.
indices
),
len
(
csr
.
data
)))
raise
ValueError
(
'
L
ength mismatch: {} vs {}'
.
format
(
len
(
csr
.
indices
),
len
(
csr
.
data
)))
self
.
handle
=
ctypes
.
c_void_p
()
ptr_indptr
,
type_ptr_indptr
=
c_int_array
(
csr
.
indptr
)
...
...
@@ -685,7 +681,7 @@ class _InnerDataset(object):
elif
out_type
.
value
==
C_API_DTYPE_FLOAT32
:
return
cfloat32_array_to_numpy
(
ctypes
.
cast
(
ret
,
ctypes
.
POINTER
(
ctypes
.
c_float
)),
tmp_out_len
.
value
)
else
:
raise
TypeError
(
"
u
nknow type"
)
raise
TypeError
(
"
U
nknow
n
type"
)
def
set_field
(
self
,
field_name
,
data
):
"""Set property into the _InnerDataset.
...
...
@@ -707,11 +703,8 @@ class _InnerDataset(object):
0
,
FIELD_TYPE_MAPPER
[
field_name
]))
return
if
IS_PANDAS_INSTALLED
and
isinstance
(
data
,
Series
):
dtype
=
np
.
int32
if
field_name
==
'group'
else
np
.
float32
data
=
data
.
astype
(
dtype
).
values
if
not
is_numpy_1d_array
(
data
):
raise
TypeError
(
"Unknow type({})"
.
format
(
type
(
data
).
__name__
))
data
=
list_to_1d_numpy
(
data
,
dtype
,
name
=
field_name
)
if
data
.
dtype
==
np
.
float32
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))
type_data
=
C_API_DTYPE_FLOAT32
...
...
@@ -719,9 +712,9 @@ class _InnerDataset(object):
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_int32
))
type_data
=
C_API_DTYPE_INT32
else
:
raise
TypeError
(
"
e
xcepted np.float32 or np.int32, met type({})"
.
format
(
data
.
dtype
))
raise
TypeError
(
"
E
xcepted np.float32 or np.int32, me
e
t type({})"
.
format
(
data
.
dtype
))
if
type_data
!=
FIELD_TYPE_MAPPER
[
field_name
]:
raise
TypeError
(
"type error for set_field"
)
raise
TypeError
(
"
Input
type error for set_field"
)
_safe_call
(
_LIB
.
LGBM_DatasetSetField
(
self
.
handle
,
c_str
(
field_name
),
...
...
@@ -749,7 +742,7 @@ class _InnerDataset(object):
label: numpy array or list or None
The label information to be set into _InnerDataset
"""
label
=
list_to_1d_numpy
(
label
,
n
p
.
float32
)
label
=
list_to_1d_numpy
(
label
,
n
ame
=
'label'
)
self
.
set_field
(
'label'
,
label
)
def
set_weight
(
self
,
weight
):
...
...
@@ -761,7 +754,7 @@ class _InnerDataset(object):
Weight for each data point
"""
if
weight
is
not
None
:
weight
=
list_to_1d_numpy
(
weight
,
n
p
.
float32
)
weight
=
list_to_1d_numpy
(
weight
,
n
ame
=
'weight'
)
self
.
set_field
(
'weight'
,
weight
)
def
set_init_score
(
self
,
score
):
...
...
@@ -773,7 +766,7 @@ class _InnerDataset(object):
Init score for booster
"""
if
score
is
not
None
:
score
=
list_to_1d_numpy
(
score
,
n
p
.
float32
)
score
=
list_to_1d_numpy
(
score
,
n
ame
=
'init score'
)
self
.
set_field
(
'init_score'
,
score
)
def
set_group
(
self
,
group
):
...
...
@@ -785,7 +778,7 @@ class _InnerDataset(object):
Group size of each group
"""
if
group
is
not
None
:
group
=
list_to_1d_numpy
(
group
,
np
.
int32
)
group
=
list_to_1d_numpy
(
group
,
np
.
int32
,
name
=
'group'
)
self
.
set_field
(
'group'
,
group
)
def
get_label
(
self
):
...
...
@@ -941,7 +934,8 @@ class Dataset(object):
else
:
self
.
inner_dataset
=
_InnerDataset
(
self
.
data
,
self
.
label
,
self
.
max_bin
,
None
,
self
.
weight
,
self
.
group
,
self
.
_predictor
,
self
.
silent
,
self
.
feature_name
,
self
.
categorical_feature
,
self
.
params
)
self
.
silent
,
self
.
feature_name
,
self
.
categorical_feature
,
self
.
params
)
if
self
.
free_raw_data
:
self
.
data
=
None
...
...
@@ -994,7 +988,7 @@ class Dataset(object):
Parameters
----------
reference : Dataset
w
ill use reference as template to consturct current dataset
W
ill use reference as template to consturct current dataset
"""
self
.
set_categorical_feature
(
reference
.
categorical_feature
)
self
.
set_feature_name
(
reference
.
feature_name
)
...
...
@@ -1015,7 +1009,7 @@ class Dataset(object):
Parameters
----------
feature_name : list of str
f
eature names
F
eature names
"""
self
.
feature_name
=
feature_name
if
self
.
__is_constructed
():
...
...
@@ -1028,9 +1022,9 @@ class Dataset(object):
Parameters
----------
used_indices : list of int
u
se indices of this subset
U
se
d
indices of this subset
params : dict
o
ther parameters
O
ther parameters
"""
ret
=
Dataset
(
None
)
ret
.
feature_name
=
self
.
feature_name
...
...
@@ -1198,7 +1192,7 @@ class Booster(object):
if
train_set
is
not
None
:
"""Training task"""
if
not
isinstance
(
train_set
,
Dataset
):
raise
TypeError
(
'
t
raining data should be Dataset instance, met {}'
.
format
(
type
(
train_set
).
__name__
))
raise
TypeError
(
'
T
raining data should be Dataset instance, met {}'
.
format
(
type
(
train_set
).
__name__
))
params_str
=
param_dict_to_str
(
params
)
"""construct booster object"""
_safe_call
(
_LIB
.
LGBM_BoosterCreate
(
...
...
@@ -1237,7 +1231,7 @@ class Booster(object):
ctypes
.
byref
(
out_num_class
)))
self
.
__num_class
=
out_num_class
.
value
else
:
raise
TypeError
(
'
A
t least ne
ed
training dataset or model file to create booster instance'
)
raise
TypeError
(
'
Need a
t least
o
ne training dataset or model file to create booster instance'
)
def
__del__
(
self
):
if
self
.
handle
is
not
None
:
...
...
@@ -1342,22 +1336,10 @@ class Booster(object):
-------
is_finished, bool
"""
if
not
is_numpy_1d_array
(
grad
):
if
is_1d_list
(
grad
):
grad
=
np
.
array
(
grad
,
dtype
=
np
.
float32
,
copy
=
False
)
else
:
raise
TypeError
(
"grad should be numpy 1d array or 1d list"
)
if
not
is_numpy_1d_array
(
hess
):
if
is_1d_list
(
hess
):
hess
=
np
.
array
(
hess
,
dtype
=
np
.
float32
,
copy
=
False
)
else
:
raise
TypeError
(
"hess should be numpy 1d array or 1d list"
)
grad
=
list_to_1d_numpy
(
grad
,
name
=
'gradient'
)
hess
=
list_to_1d_numpy
(
hess
,
name
=
'hessian'
)
if
len
(
grad
)
!=
len
(
hess
):
raise
ValueError
(
'grad / hess lengths mismatch: {} / {}'
.
format
(
len
(
grad
),
len
(
hess
)))
if
grad
.
dtype
!=
np
.
float32
:
grad
=
grad
.
astype
(
np
.
float32
,
copy
=
False
)
if
hess
.
dtype
!=
np
.
float32
:
hess
=
hess
.
astype
(
np
.
float32
,
copy
=
False
)
raise
ValueError
(
"Lengths of gradient({}) and hessian({}) don't match"
.
format
(
len
(
grad
),
len
(
hess
)))
is_finished
=
ctypes
.
c_int
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterUpdateOneIterCustom
(
self
.
handle
,
...
...
@@ -1548,7 +1530,7 @@ class Booster(object):
Evaulate training or validation data
"""
if
data_idx
>=
self
.
__num_dataset
:
raise
ValueError
(
"
d
ata_idx should be smaller than number of dataset"
)
raise
ValueError
(
"
D
ata_idx should be smaller than number of dataset"
)
self
.
__get_eval_info
()
ret
=
[]
if
self
.
__num_inner_eval
>
0
:
...
...
@@ -1560,7 +1542,7 @@ class Booster(object):
ctypes
.
byref
(
tmp_out_len
),
result
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))))
if
tmp_out_len
.
value
!=
self
.
__num_inner_eval
:
raise
ValueError
(
"
incorrect number
of eval results"
)
raise
ValueError
(
"
Wrong length
of eval results"
)
for
i
in
range
(
self
.
__num_inner_eval
):
ret
.
append
((
data_name
,
self
.
__name_inner_eval
[
i
],
result
[
i
],
self
.
__higher_better_inner_eval
[
i
]))
if
feval
is
not
None
:
...
...
@@ -1582,7 +1564,7 @@ class Booster(object):
Predict for training and validation dataset
"""
if
data_idx
>=
self
.
__num_dataset
:
raise
ValueError
(
"
d
ata_idx should be smaller than number of dataset"
)
raise
ValueError
(
"
D
ata_idx should be smaller than number of dataset"
)
if
self
.
__inner_predict_buffer
[
data_idx
]
is
None
:
if
data_idx
==
0
:
n_preds
=
self
.
train_set
.
num_data
()
*
self
.
__num_class
...
...
@@ -1600,7 +1582,7 @@ class Booster(object):
ctypes
.
byref
(
tmp_out_len
),
data_ptr
))
if
tmp_out_len
.
value
!=
len
(
self
.
__inner_predict_buffer
[
data_idx
]):
raise
ValueError
(
"
incorrect number
of predict results for data %d"
%
(
data_idx
))
raise
ValueError
(
"
Wrong length
of predict results for data %d"
%
(
data_idx
))
self
.
__is_predicted_cur_iter
[
data_idx
]
=
True
return
self
.
__inner_predict_buffer
[
data_idx
]
...
...
@@ -1626,7 +1608,7 @@ class Booster(object):
ctypes
.
byref
(
tmp_out_len
),
ptr_string_buffers
))
if
self
.
__num_inner_eval
!=
tmp_out_len
.
value
:
raise
ValueError
(
"
size
of eval names doesn't equal with num_evals"
)
raise
ValueError
(
"
Length
of eval names doesn't equal with num_evals"
)
self
.
__name_inner_eval
=
\
[
string_buffers
[
i
].
value
.
decode
()
for
i
in
range
(
self
.
__num_inner_eval
)]
self
.
__higher_better_inner_eval
=
\
...
...
@@ -1658,7 +1640,7 @@ class Booster(object):
for
key
,
value
in
kwargs
.
items
():
if
value
is
not
None
:
if
not
is_str
(
value
):
raise
ValueError
(
"
s
et
_
attr only accepts string
value
s"
)
raise
ValueError
(
"
S
et
attr only accepts strings"
)
self
.
__attr
[
key
]
=
value
else
:
self
.
__attr
.
pop
(
key
,
None
)
python-package/lightgbm/callback.py
View file @
ebfc8521
...
...
@@ -35,7 +35,7 @@ def _format_eval_result(value, show_stdv=True):
else
:
return
'%s
\'
s %s:%g'
%
(
value
[
0
],
value
[
1
],
value
[
2
])
else
:
raise
ValueError
(
"
w
rong metric value"
)
raise
ValueError
(
"
W
rong metric value"
)
def
print_evaluation
(
period
=
1
,
show_stdv
=
True
):
...
...
@@ -80,7 +80,7 @@ def record_evaluation(eval_result):
The requested callback function.
"""
if
not
isinstance
(
eval_result
,
dict
):
raise
TypeError
(
'
e
val_result
has to
be a dictionary'
)
raise
TypeError
(
'
E
val_result
should
be a dictionary'
)
eval_result
.
clear
()
def
init
(
env
):
...
...
@@ -164,7 +164,7 @@ def early_stop(stopping_rounds, verbose=True):
def
init
(
env
):
"""internal function"""
if
not
env
.
evaluation_result_list
:
raise
ValueError
(
'For early stopping
you need
at least one set i
n evals.
'
)
raise
ValueError
(
'For early stopping
,
at least one
data
set i
s required for evaluation
'
)
if
verbose
:
msg
=
"Train until valid scores didn't improve in {} rounds."
...
...
@@ -194,7 +194,7 @@ def early_stop(stopping_rounds, verbose=True):
if
env
.
model
is
not
None
:
env
.
model
.
set_attr
(
best_iteration
=
str
(
best_iter
[
i
]))
if
verbose
:
print
(
'
e
arly stopping, best iteration is:'
)
print
(
'
E
arly stopping, best iteration is:'
)
print
(
best_msg
[
i
])
raise
EarlyStopException
(
best_iter
[
i
])
callback
.
order
=
30
...
...
python-package/lightgbm/engine.py
View file @
ebfc8521
...
...
@@ -85,10 +85,10 @@ def train(params, train_set, num_boost_round=100,
predictor
=
init_model
.
_to_predictor
()
else
:
predictor
=
None
init_iteration
=
predictor
.
num_total_iteration
if
predictor
else
0
init_iteration
=
predictor
.
num_total_iteration
if
predictor
is
not
None
else
0
"""check dataset"""
if
not
isinstance
(
train_set
,
Dataset
):
raise
TypeError
(
"only
can
accept Dataset
instance for traninig
"
)
raise
TypeError
(
"
Traninig
only accept
s
Dataset
object
"
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
...
...
@@ -98,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name
=
"training"
reduced_valid_sets
=
[]
name_valid_sets
=
[]
if
valid_sets
:
if
valid_sets
is
not
None
:
if
isinstance
(
valid_sets
,
Dataset
):
valid_sets
=
[
valid_sets
]
if
isinstance
(
valid_names
,
str
):
...
...
@@ -111,7 +111,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name
=
valid_names
[
i
]
continue
if
not
isinstance
(
valid_data
,
Dataset
):
raise
TypeError
(
"only
can
accept Dataset
instance for traninig
"
)
raise
TypeError
(
"
Traninig
only accept
s
Dataset
object
"
)
valid_data
.
set_reference
(
train_set
)
reduced_valid_sets
.
append
(
valid_data
)
if
valid_names
is
not
None
and
len
(
valid_names
)
>
i
:
...
...
@@ -120,7 +120,7 @@ def train(params, train_set, num_boost_round=100,
name_valid_sets
.
append
(
'valid_'
+
str
(
i
))
"""process callbacks"""
if
not
callbacks
:
if
callbacks
is
None
:
callbacks
=
set
()
else
:
for
i
,
cb
in
enumerate
(
callbacks
):
...
...
@@ -133,7 +133,7 @@ def train(params, train_set, num_boost_round=100,
elif
isinstance
(
verbose_eval
,
int
):
callbacks
.
add
(
callback
.
print_evaluation
(
verbose_eval
))
if
early_stopping_rounds
:
if
early_stopping_rounds
is
not
None
:
callbacks
.
add
(
callback
.
early_stop
(
early_stopping_rounds
,
verbose
=
bool
(
verbose_eval
)))
...
...
@@ -169,7 +169,7 @@ def train(params, train_set, num_boost_round=100,
evaluation_result_list
=
[]
# check evaluation result.
if
valid_sets
:
if
valid_sets
is
not
None
:
if
is_valid_contain_train
:
evaluation_result_list
.
extend
(
booster
.
eval_train
(
feval
))
evaluation_result_list
.
extend
(
booster
.
eval_valid
(
feval
))
...
...
@@ -227,7 +227,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
sfk
=
StratifiedKFold
(
n_splits
=
nfold
,
shuffle
=
True
,
random_state
=
seed
)
idset
=
[
x
[
1
]
for
x
in
sfk
.
split
(
X
=
full_data
.
get_label
(),
y
=
full_data
.
get_label
())]
else
:
raise
LightGBMError
(
'
sklearn needs to be installed in order to use
stratified cv'
)
raise
LightGBMError
(
'
Scikit-learn is required for
stratified cv'
)
else
:
full_data
.
construct
()
randidx
=
np
.
random
.
permutation
(
full_data
.
num_data
())
...
...
@@ -318,7 +318,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
evaluation history : list(string)
"""
if
not
isinstance
(
train_set
,
Dataset
):
raise
TypeError
(
"only
can
accept Dataset
instance for traninig
"
)
raise
TypeError
(
"
Traninig
only accept
s
Dataset
object
"
)
if
is_str
(
init_model
):
predictor
=
_InnerPredictor
(
model_file
=
init_model
)
...
...
@@ -342,13 +342,13 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
cvfolds
=
_make_n_folds
(
train_set
,
nfold
,
params
,
seed
,
fpreproc
,
stratified
)
# setup callbacks
if
not
callbacks
:
if
callbacks
is
None
:
callbacks
=
set
()
else
:
for
i
,
cb
in
enumerate
(
callbacks
):
cb
.
__dict__
.
setdefault
(
'order'
,
i
-
len
(
callbacks
))
callbacks
=
set
(
callbacks
)
if
early_stopping_rounds
:
if
early_stopping_rounds
is
not
None
:
callbacks
.
add
(
callback
.
early_stop
(
early_stopping_rounds
,
verbose
=
False
))
if
verbose_eval
is
True
:
callbacks
.
add
(
callback
.
print_evaluation
(
show_stdv
=
show_stdv
))
...
...
python-package/lightgbm/sklearn.py
View file @
ebfc8521
...
...
@@ -6,7 +6,7 @@ from __future__ import absolute_import
import
numpy
as
np
from
.basic
import
LightGBMError
,
Dataset
,
is_str
from
.engine
import
train
#
sklearn
'''
sklearn
'''
try
:
from
sklearn.base
import
BaseEstimator
from
sklearn.base
import
RegressorMixin
,
ClassifierMixin
...
...
@@ -38,7 +38,6 @@ def _point_wise_objective(func):
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
-------
new_func: callable
...
...
@@ -66,7 +65,7 @@ def _point_wise_objective(func):
num_data
=
len
(
weight
)
num_class
=
len
(
grad
)
//
num_data
if
num_class
*
num_data
!=
len
(
grad
):
raise
ValueError
(
"
l
ength of grad and hess should equal to num_class * num_data"
)
raise
ValueError
(
"
L
ength of grad and hess should equal to num_class * num_data"
)
for
k
in
range
(
num_class
):
for
i
in
range
(
num_data
):
idx
=
k
*
num_data
+
i
...
...
@@ -147,7 +146,7 @@ class LGBMModel(LGBMModelBase):
reg_alpha
=
0
,
reg_lambda
=
0
,
scale_pos_weight
=
1
,
is_unbalance
=
False
,
seed
=
0
):
if
not
SKLEARN_INSTALLED
:
raise
LightGBMError
(
'
sklearn needs to be installed in order to use
this module'
)
raise
LightGBMError
(
'
Scikit-learn is required for
this module'
)
self
.
num_leaves
=
num_leaves
self
.
max_depth
=
max_depth
...
...
@@ -185,7 +184,7 @@ class LGBMModel(LGBMModelBase):
booster : a lightgbm booster of underlying model
"""
if
self
.
_Booster
is
None
:
raise
LightGBMError
(
'
n
eed to call fit beforehand'
)
raise
LightGBMError
(
'
N
eed to call fit beforehand'
)
return
self
.
_Booster
def
get_params
(
self
,
deep
=
False
):
...
...
@@ -343,7 +342,7 @@ class LGBMModel(LGBMModelBase):
if
self
.
evals_result_
:
evals_result
=
self
.
evals_result_
else
:
raise
LightGBMError
(
'No results.'
)
raise
LightGBMError
(
'No results
found
.'
)
return
evals_result
...
...
@@ -390,8 +389,8 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
is_unbalance
=
False
,
seed
=
0
):
super
(
LGBMClassifier
,
self
).
__init__
(
num_leaves
,
max_depth
,
learning_rate
,
n_estimators
,
max_bin
,
silent
,
objective
,
nthread
,
min_split_gain
,
min_child_weight
,
min_child_samples
,
silent
,
objective
,
nthread
,
min_split_gain
,
min_child_weight
,
min_child_samples
,
subsample
,
subsample_freq
,
colsample_bytree
,
reg_alpha
,
reg_lambda
,
scale_pos_weight
,
is_unbalance
,
seed
)
...
...
@@ -480,7 +479,7 @@ def _group_wise_objective(func):
labels
=
dataset
.
get_label
()
group
=
dataset
.
get_group
()
if
group
is
None
:
raise
ValueError
(
"
g
roup should not be None for ranking task"
)
raise
ValueError
(
"
G
roup should not be None for ranking task"
)
grad
,
hess
=
func
(
labels
,
group
,
preds
)
"""weighted for objective"""
weight
=
dataset
.
get_weight
()
...
...
@@ -490,7 +489,7 @@ def _group_wise_objective(func):
grad
=
np
.
multiply
(
grad
,
weight
)
hess
=
np
.
multiply
(
hess
,
weight
)
else
:
raise
ValueError
(
"
l
eng
h
t of grad and hess should equal with num_data"
)
raise
ValueError
(
"
L
engt
h
of grad and hess should equal with num_data"
)
return
grad
,
hess
return
inner
...
...
@@ -508,8 +507,8 @@ class LGBMRanker(LGBMModel):
is_unbalance
=
False
,
seed
=
0
):
super
(
LGBMRanker
,
self
).
__init__
(
num_leaves
,
max_depth
,
learning_rate
,
n_estimators
,
max_bin
,
silent
,
objective
,
nthread
,
min_split_gain
,
min_child_weight
,
min_child_samples
,
silent
,
objective
,
nthread
,
min_split_gain
,
min_child_weight
,
min_child_samples
,
subsample
,
subsample_freq
,
colsample_bytree
,
reg_alpha
,
reg_lambda
,
scale_pos_weight
,
is_unbalance
,
seed
)
...
...
@@ -535,17 +534,18 @@ class LGBMRanker(LGBMModel):
"""check group data"""
if
group
is
None
:
raise
ValueError
(
"
s
hould
u
se group for ranking task"
)
raise
ValueError
(
"
S
hould se
t
group for ranking task"
)
if
eval_set
is
not
None
:
if
eval_group
is
None
:
raise
ValueError
(
"
e
val_group cannot be None when eval_set is not None"
)
raise
ValueError
(
"
E
val_group cannot be None when eval_set is not None"
)
elif
len
(
eval_group
)
!=
len
(
eval_set
):
raise
ValueError
(
"
l
ength of eval_group should equal
with
eval_set"
)
raise
ValueError
(
"
L
ength of eval_group should equal
to
eval_set"
)
else
:
for
inner_group
in
eval_group
:
if
inner_group
is
None
:
raise
ValueError
(
"should set group for all eval data for ranking task"
)
raise
ValueError
(
"Should set group for all eval dataset for ranking task"
)
if
eval_at
is
not
None
:
other_params
=
{}
if
other_params
is
None
else
other_params
other_params
[
'ndcg_eval_at'
]
=
list
(
eval_at
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment