Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
b51c7be4
Commit
b51c7be4
authored
Dec 08, 2016
by
Guolin Ke
Committed by
GitHub
Dec 08, 2016
Browse files
refine Dataset class (#113)
Provide a high level Dataset class for easy use.
parent
f3d33582
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
683 additions
and
256 deletions
+683
-256
.travis.yml
.travis.yml
+3
-3
CMakeLists.txt
CMakeLists.txt
+1
-1
examples/python-guide/simple_example.py
examples/python-guide/simple_example.py
+2
-10
include/LightGBM/config.h
include/LightGBM/config.h
+2
-0
python-package/lightgbm/__init__.py
python-package/lightgbm/__init__.py
+1
-1
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+403
-81
python-package/lightgbm/engine.py
python-package/lightgbm/engine.py
+49
-105
python-package/lightgbm/libpath.py
python-package/lightgbm/libpath.py
+1
-0
python-package/lightgbm/sklearn.py
python-package/lightgbm/sklearn.py
+131
-49
src/io/config.cpp
src/io/config.cpp
+13
-2
tests/python_package_test/test_basic.py
tests/python_package_test/test_basic.py
+0
-4
tests/python_package_test/test_engine.py
tests/python_package_test/test_engine.py
+77
-0
No files found.
.travis.yml
View file @
b51c7be4
...
...
@@ -14,7 +14,7 @@ before_install:
install
:
-
sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential
-
conda install --yes atlas numpy scipy scikit-learn
-
conda install --yes atlas numpy scipy scikit-learn
pandas
script
:
...
...
@@ -22,12 +22,12 @@ script:
-
mkdir build && cd build && cmake .. && make -j
-
cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
-
cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
-
cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
-
cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python
test_engine.py && python
test_sklearn.py
-
cd $TRAVIS_BUILD_DIR
-
rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
-
cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
-
cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
-
cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
-
cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python
test_engine.py && python
test_sklearn.py
notifications
:
email
:
false
...
...
CMakeLists.txt
View file @
b51c7be4
...
...
@@ -76,7 +76,7 @@ add_executable(lightgbm src/main.cpp ${SOURCES})
add_library
(
_lightgbm SHARED src/c_api.cpp
${
SOURCES
}
)
if
(
MSVC
)
set_target_properties
(
_lightgbm PROPERTIES OUTPUT_NAME
"lightgbm"
)
set_target_properties
(
_lightgbm PROPERTIES OUTPUT_NAME
"
lib_
lightgbm"
)
endif
(
MSVC
)
if
(
USE_MPI
)
...
...
examples/python-guide/simple_example.py
View file @
b51c7be4
...
...
@@ -17,13 +17,7 @@ X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train
=
lgb
.
Dataset
(
X_train
,
y_train
)
lgb_eval
=
lgb
.
Dataset
(
X_test
,
y_test
,
reference
=
lgb_train
)
# ATTENTION: you should carefully use lightgbm.Dataset
# it requires setting up categorical_feature when you init it
# rather than passing from lightgbm.train
# instead, you can simply use a tuple of length=2 like below
# it will help you construct Datasets with parameters in lightgbm.train
lgb_train
=
(
X_train
,
y_train
)
lgb_eval
=
(
X_test
,
y_test
)
# specify your configurations as a dict
params
=
{
...
...
@@ -43,9 +37,7 @@ params = {
gbm
=
lgb
.
train
(
params
,
lgb_train
,
num_boost_round
=
100
,
valid_datas
=
lgb_eval
,
# you can use a list to represent multiple valid_datas/valid_names
# don't use tuple, tuple is used to represent one dataset
valid_sets
=
lgb_eval
,
early_stopping_rounds
=
10
)
# save model to file
...
...
include/LightGBM/config.h
View file @
b51c7be4
...
...
@@ -230,6 +230,7 @@ struct OverallConfig: public ConfigBase {
public:
TaskType
task_type
=
TaskType
::
kTrain
;
NetworkConfig
network_config
;
int
seed
=
0
;
int
num_threads
=
0
;
bool
is_parallel
=
false
;
bool
is_parallel_find_bin
=
false
;
...
...
@@ -317,6 +318,7 @@ struct ParameterAlias {
{
{
"config"
,
"config_file"
},
{
"nthread"
,
"num_threads"
},
{
"random_seed"
,
"seed"
},
{
"num_thread"
,
"num_threads"
},
{
"boosting"
,
"boosting_type"
},
{
"boost"
,
"boosting_type"
},
...
...
python-package/lightgbm/__init__.py
View file @
b51c7be4
...
...
@@ -8,7 +8,7 @@ from __future__ import absolute_import
import
os
from
.basic
import
Predictor
,
Dataset
,
Booster
from
.basic
import
Dataset
,
Booster
from
.engine
import
train
,
cv
try
:
from
.sklearn
import
LGBMModel
,
LGBMRegressor
,
LGBMClassifier
,
LGBMRanker
...
...
python-package/lightgbm/basic.py
View file @
b51c7be4
...
...
@@ -14,7 +14,7 @@ import scipy.sparse
from
.libpath
import
find_lib_path
#
pandas
"""
pandas
"""
try
:
from
pandas
import
Series
,
DataFrame
IS_PANDAS_INSTALLED
=
True
...
...
@@ -53,22 +53,27 @@ def _safe_call(ret):
raise
LightGBMError
(
_LIB
.
LGBM_GetLastError
())
def
is_str
(
s
):
"""Check is a str or not"""
if
IS_PY3
:
return
isinstance
(
s
,
str
)
else
:
return
isinstance
(
s
,
basestring
)
def
is_numpy_object
(
data
):
"""Check is numpy object"""
return
type
(
data
).
__module__
==
np
.
__name__
def
is_numpy_1d_array
(
data
):
"""Check is 1d numpy array"""
return
isinstance
(
data
,
np
.
ndarray
)
and
len
(
data
.
shape
)
==
1
def
is_1d_list
(
data
):
"""Check is 1d list"""
return
isinstance
(
data
,
list
)
and
\
(
not
data
or
isinstance
(
data
[
0
],
(
int
,
float
,
bool
)))
def
list_to_1d_numpy
(
data
,
dtype
):
"""convert to 1d numpy array"""
if
is_numpy_1d_array
(
data
):
if
data
.
dtype
==
dtype
:
return
data
...
...
@@ -112,7 +117,7 @@ def param_dict_to_str(data):
return
""
pairs
=
[]
for
key
,
val
in
data
.
items
():
if
is_str
(
val
)
or
isinstance
(
val
,
(
int
,
float
,
bool
)):
if
is_str
(
val
)
or
isinstance
(
val
,
(
int
,
float
,
bool
,
np
.
integer
,
np
.
float
,
np
.
float32
)):
pairs
.
append
(
str
(
key
)
+
'='
+
str
(
val
))
elif
isinstance
(
val
,
(
list
,
tuple
,
set
)):
pairs
.
append
(
str
(
key
)
+
'='
+
','
.
join
(
map
(
str
,
val
)))
...
...
@@ -126,20 +131,23 @@ C_API_DTYPE_FLOAT32 = 0
C_API_DTYPE_FLOAT64
=
1
C_API_DTYPE_INT32
=
2
C_API_DTYPE_INT64
=
3
"""Matric is row major in python"""
C_API_IS_ROW_MAJOR
=
1
"""marco definition of prediction type in c_api of LightGBM"""
C_API_PREDICT_NORMAL
=
0
C_API_PREDICT_RAW_SCORE
=
1
C_API_PREDICT_LEAF_INDEX
=
2
"""data type of data field"""
FIELD_TYPE_MAPPER
=
{
"label"
:
C_API_DTYPE_FLOAT32
,
"weight"
:
C_API_DTYPE_FLOAT32
,
"init_score"
:
C_API_DTYPE_FLOAT32
,
"group"
:
C_API_DTYPE_INT32
}
def
c_float_array
(
data
):
"""
Conver
t numpy array / list
to c float array.
"""
"""
get pointer of floa
t numpy array / list"""
if
is_1d_list
(
data
):
data
=
np
.
array
(
data
,
copy
=
False
)
if
is_numpy_1d_array
(
data
):
...
...
@@ -157,7 +165,7 @@ def c_float_array(data):
return
(
ptr_data
,
type_data
)
def
c_int_array
(
data
):
"""
Convert numpy array to c int array.
"""
"""
get pointer of int numpy array / list
"""
if
is_1d_list
(
data
):
data
=
np
.
array
(
data
,
copy
=
False
)
if
is_numpy_1d_array
(
data
):
...
...
@@ -174,16 +182,21 @@ def c_int_array(data):
raise
TypeError
(
"Unknow type({})"
.
format
(
type
(
data
).
__name__
))
return
(
ptr_data
,
type_data
)
class
Predictor
(
object
):
""""A Predictor of LightGBM.
class
_InnerPredictor
(
object
):
"""
A _InnerPredictor of LightGBM.
Only used for prediction, usually used for continued-train
Note: Can convert from Booster, but cannot convert to Booster
"""
def
__init__
(
self
,
model_file
=
None
,
booster_handle
=
None
,
is_manage_handle
=
True
):
"""Initialize the Predictor.
def
__init__
(
self
,
model_file
=
None
,
booster_handle
=
None
):
"""Initialize the
_Inner
Predictor.
Not expose to user
Parameters
----------
model_file : string
Path to the model file.
booster_handle : Handle of Booster
use handle to init
"""
self
.
handle
=
ctypes
.
c_void_p
()
self
.
__is_manage_handle
=
True
...
...
@@ -201,7 +214,7 @@ class Predictor(object):
self
.
num_class
=
out_num_class
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
elif
booster_handle
is
not
None
:
self
.
__is_manage_handle
=
is_manage_handl
e
self
.
__is_manage_handle
=
Fals
e
self
.
handle
=
booster_handle
out_num_class
=
ctypes
.
c_int64
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterGetNumClasses
(
...
...
@@ -214,7 +227,7 @@ class Predictor(object):
ctypes
.
byref
(
out_num_iterations
)))
self
.
num_total_iteration
=
out_num_iterations
.
value
else
:
raise
TypeError
(
'Need Model file to create a
booste
r'
)
raise
TypeError
(
'Need Model file
or Booster handle
to create a
predicto
r'
)
def
__del__
(
self
):
if
self
.
__is_manage_handle
:
...
...
@@ -239,7 +252,7 @@ class Predictor(object):
pred_leaf : bool
True for predict leaf index
data_has_header : bool
Used for txt data
Used for txt data
, True if txt data has header
is_reshape : bool
Reshape to (nrow, ncol) if true
...
...
@@ -247,7 +260,7 @@ class Predictor(object):
-------
Prediction result
"""
if
isinstance
(
data
,
Dataset
):
if
isinstance
(
data
,
(
_InnerDataset
,
Dataset
)
):
raise
TypeError
(
"cannot use Dataset instance for prediction, please use raw data instead"
)
predict_type
=
C_API_PREDICT_NORMAL
if
raw_score
:
...
...
@@ -299,6 +312,9 @@ class Predictor(object):
return
preds
def
__get_num_preds
(
self
,
num_iteration
,
nrow
,
predict_type
):
"""
Get size of prediction result
"""
n_preds
=
self
.
num_class
*
nrow
if
predict_type
==
C_API_PREDICT_LEAF_INDEX
:
if
num_iteration
>
0
:
...
...
@@ -398,10 +414,10 @@ def _label_from_pandas(label):
label
=
label
.
values
.
astype
(
'float'
)
return
label
class
Dataset
(
object
):
"""Dataset used in LightGBM.
Dataset is a internal data structure that used by LightGBM
class
_Inner
Dataset
(
object
):
"""
_Inner
Dataset used in LightGBM.
_InnerDataset is a internal data structure that used by LightGBM.
This class is not exposed. Please use Dataset instead
"""
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
...
...
@@ -409,23 +425,25 @@ class Dataset(object):
silent
=
False
,
feature_name
=
None
,
categorical_feature
=
None
,
params
=
None
):
"""
Dataset used in LightGBM.
_Inner
Dataset used in LightGBM.
Parameters
----------
data : string/numpy array/scipy.sparse
Data source of Dataset.
Data source of
_Inner
Dataset.
When data type is string, it represents the path of txt file
label : list or numpy 1-D array, optional
Label of the data
max_bin : int, required
Max number of discrete bin for features
reference : Other Dataset, optional
reference : Other
_Inner
Dataset, optional
If this dataset validation, need to use training data as reference
weight : list or numpy 1-D array , optional
Weight for each instance.
group : list or numpy 1-D array , optional
Group/query size for dataset
predictor : _InnerPredictor
Used for continuned train
silent : boolean, optional
Whether print messages during construction
feature_name : list of str
...
...
@@ -436,10 +454,6 @@ class Dataset(object):
params: dict, optional
Other parameters
"""
self
.
__label
=
None
self
.
__weight
=
None
self
.
__init_score
=
None
self
.
__group
=
None
if
data
is
None
:
self
.
handle
=
None
return
...
...
@@ -475,7 +489,7 @@ class Dataset(object):
params_str
=
param_dict_to_str
(
params
)
"""process for reference dataset"""
ref_dataset
=
None
if
isinstance
(
reference
,
Dataset
):
if
isinstance
(
reference
,
_Inner
Dataset
):
ref_dataset
=
ctypes
.
byref
(
reference
.
handle
)
elif
reference
is
not
None
:
raise
TypeError
(
'Reference dataset should be None or dataset instance'
)
...
...
@@ -500,7 +514,7 @@ class Dataset(object):
csr
=
scipy
.
sparse
.
csr_matrix
(
data
)
self
.
__init_from_csr
(
csr
,
params_str
,
ref_dataset
)
except
:
raise
TypeError
(
'can not initialize Dataset from {}'
.
format
(
type
(
data
).
__name__
))
raise
TypeError
(
'can not initialize
_Inner
Dataset from {}'
.
format
(
type
(
data
).
__name__
))
if
label
is
not
None
:
self
.
set_label
(
label
)
if
self
.
get_label
()
is
None
:
...
...
@@ -510,7 +524,7 @@ class Dataset(object):
if
group
is
not
None
:
self
.
set_group
(
group
)
# load init score
if
self
.
predictor
is
not
None
and
isinstance
(
self
.
predictor
,
Predictor
):
if
isinstance
(
self
.
predictor
,
_Inner
Predictor
):
init_score
=
self
.
predictor
.
predict
(
data
,
raw_score
=
True
,
data_has_header
=
self
.
data_has_header
,
...
...
@@ -524,6 +538,8 @@ class Dataset(object):
new_init_score
[
j
*
num_data
+
i
]
=
init_score
[
i
*
self
.
predictor
.
num_class
+
j
]
init_score
=
new_init_score
self
.
set_init_score
(
init_score
)
elif
self
.
predictor
is
not
None
:
raise
TypeError
(
'wrong predictor type {}'
.
format
(
type
(
self
.
predictor
).
__name__
))
# set feature names
self
.
set_feature_name
(
feature_name
)
...
...
@@ -535,7 +551,7 @@ class Dataset(object):
Parameters
----------
data : string/numpy array/scipy.sparse
Data source of Dataset.
Data source of
_Inner
Dataset.
When data type is string, it represents the path of txt file
label : list or numpy 1-D array, optional
Label of the training data.
...
...
@@ -548,16 +564,16 @@ class Dataset(object):
params: dict, optional
Other parameters
"""
return
Dataset
(
data
,
label
=
label
,
max_bin
=
self
.
max_bin
,
reference
=
self
,
weight
=
weight
,
group
=
group
,
predictor
=
self
.
predictor
,
silent
=
silent
,
params
=
params
)
return
_Inner
Dataset
(
data
,
label
=
label
,
max_bin
=
self
.
max_bin
,
reference
=
self
,
weight
=
weight
,
group
=
group
,
predictor
=
self
.
predictor
,
silent
=
silent
,
params
=
params
)
def
subset
(
self
,
used_indices
,
params
=
None
):
"""
Get subset of current dataset
"""
used_indices
=
list_to_1d_numpy
(
used_indices
,
np
.
int32
)
ret
=
Dataset
(
None
)
ret
=
_Inner
Dataset
(
None
)
ret
.
handle
=
ctypes
.
c_void_p
()
params_str
=
param_dict_to_str
(
params
)
_safe_call
(
_LIB
.
LGBM_DatasetGetSubset
(
...
...
@@ -573,6 +589,9 @@ class Dataset(object):
return
ret
def
set_feature_name
(
self
,
feature_name
):
"""
set feature names
"""
if
feature_name
is
None
:
return
if
len
(
feature_name
)
!=
self
.
num_feature
():
...
...
@@ -636,7 +655,7 @@ class Dataset(object):
_safe_call
(
_LIB
.
LGBM_DatasetFree
(
self
.
handle
))
def
get_field
(
self
,
field_name
):
"""Get property from the Dataset.
"""Get property from the
_Inner
Dataset.
Parameters
----------
...
...
@@ -669,7 +688,7 @@ class Dataset(object):
raise
TypeError
(
"unknow type"
)
def
set_field
(
self
,
field_name
,
data
):
"""Set property into the Dataset.
"""Set property into the
_Inner
Dataset.
Parameters
----------
...
...
@@ -711,7 +730,7 @@ class Dataset(object):
type_data
))
def
save_binary
(
self
,
filename
):
"""Save Dataset to binary file
"""Save
_Inner
Dataset to binary file
Parameters
----------
...
...
@@ -723,15 +742,14 @@ class Dataset(object):
c_str
(
filename
)))
def
set_label
(
self
,
label
):
"""Set label of Dataset
"""Set label of
_Inner
Dataset
Parameters
----------
label: numpy array or list or None
The label information to be set into Dataset
The label information to be set into
_Inner
Dataset
"""
label
=
list_to_1d_numpy
(
label
,
np
.
float32
)
self
.
__label
=
label
self
.
set_field
(
'label'
,
label
)
def
set_weight
(
self
,
weight
):
...
...
@@ -744,7 +762,6 @@ class Dataset(object):
"""
if
weight
is
not
None
:
weight
=
list_to_1d_numpy
(
weight
,
np
.
float32
)
self
.
__weight
=
weight
self
.
set_field
(
'weight'
,
weight
)
def
set_init_score
(
self
,
score
):
...
...
@@ -757,11 +774,10 @@ class Dataset(object):
"""
if
score
is
not
None
:
score
=
list_to_1d_numpy
(
score
,
np
.
float32
)
self
.
__init_score
=
score
self
.
set_field
(
'init_score'
,
score
)
def
set_group
(
self
,
group
):
"""Set group size of Dataset (used for ranking).
"""Set group size of
_Inner
Dataset (used for ranking).
Parameters
----------
...
...
@@ -770,57 +786,46 @@ class Dataset(object):
"""
if
group
is
not
None
:
group
=
list_to_1d_numpy
(
group
,
np
.
int32
)
self
.
__group
=
group
self
.
set_field
(
'group'
,
group
)
def
get_label
(
self
):
"""Get the label of the Dataset.
"""Get the label of the
_Inner
Dataset.
Returns
-------
label : array
"""
if
self
.
__label
is
None
:
self
.
__label
=
self
.
get_field
(
'label'
)
if
self
.
__label
is
None
:
raise
TypeError
(
"label should not be None"
)
return
self
.
__label
return
self
.
get_field
(
'label'
)
def
get_weight
(
self
):
"""Get the weight of the Dataset.
"""Get the weight of the
_Inner
Dataset.
Returns
-------
weight : array
"""
if
self
.
__weight
is
None
:
self
.
__weight
=
self
.
get_field
(
'weight'
)
return
self
.
__weight
return
self
.
get_field
(
'weight'
)
def
get_init_score
(
self
):
"""Get the initial score of the Dataset.
"""Get the initial score of the
_Inner
Dataset.
Returns
-------
init_score : array
"""
if
self
.
__init_score
is
None
:
self
.
__init_score
=
self
.
get_field
(
'init_score'
)
return
self
.
__init_score
return
self
.
get_field
(
'init_score'
)
def
get_group
(
self
):
"""Get the initial score of the Dataset.
"""Get the initial score of the
_Inner
Dataset.
Returns
-------
init_score : array
"""
if
self
.
__group
is
None
:
self
.
__group
=
self
.
get_field
(
'group'
)
return
self
.
__group
return
self
.
get_field
(
'group'
)
def
num_data
(
self
):
"""Get the number of rows in the Dataset.
"""Get the number of rows in the
_Inner
Dataset.
Returns
-------
...
...
@@ -832,7 +837,7 @@ class Dataset(object):
return
ret
.
value
def
num_feature
(
self
):
"""Get the number of columns (features) in the Dataset.
"""Get the number of columns (features) in the
_Inner
Dataset.
Returns
-------
...
...
@@ -843,6 +848,326 @@ class Dataset(object):
ctypes
.
byref
(
ret
)))
return
ret
.
value
class
Dataset
(
object
):
"""High level Dataset used in LightGBM.
"""
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
silent
=
False
,
feature_name
=
None
,
categorical_feature
=
None
,
params
=
None
,
free_raw_data
=
True
):
"""
Parameters
----------
data : string/numpy array/scipy.sparse
Data source of Dataset.
When data type is string, it represents the path of txt file
label : list or numpy 1-D array, optional
Label of the data
max_bin : int, required
Max number of discrete bin for features
reference : Other Dataset, optional
If this dataset validation, need to use training data as reference
weight : list or numpy 1-D array , optional
Weight for each instance.
group : list or numpy 1-D array , optional
Group/query size for dataset
silent : boolean, optional
Whether print messages during construction
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index,
\
type str represents feature names (need to specify feature_name as well)
params: dict, optional
Other parameters
free_raw_data: Bool
True if need to free raw data after construct inner dataset
"""
self
.
data
=
data
self
.
label
=
label
self
.
max_bin
=
max_bin
self
.
reference
=
reference
self
.
weight
=
weight
self
.
group
=
group
self
.
silent
=
silent
self
.
feature_name
=
feature_name
self
.
categorical_feature
=
categorical_feature
self
.
params
=
params
self
.
free_raw_data
=
free_raw_data
self
.
inner_dataset
=
None
self
.
used_indices
=
None
self
.
_predictor
=
None
def
create_valid
(
self
,
data
,
label
=
None
,
weight
=
None
,
group
=
None
,
silent
=
False
,
params
=
None
):
"""
Create validation data align with current dataset
Parameters
----------
data : string/numpy array/scipy.sparse
Data source of _InnerDataset.
When data type is string, it represents the path of txt file
label : list or numpy 1-D array, optional
Label of the training data.
weight : list or numpy 1-D array , optional
Weight for each instance.
group : list or numpy 1-D array , optional
Group/query size for dataset
silent : boolean, optional
Whether print messages during construction
params: dict, optional
Other parameters
"""
ret
=
Dataset
(
data
,
label
=
label
,
max_bin
=
self
.
max_bin
,
reference
=
self
,
weight
=
weight
,
group
=
group
,
silent
=
silent
,
params
=
params
,
free_raw_data
=
self
.
free_raw_data
)
ret
.
_set_predictor
(
self
.
_predictor
)
return
ret
def
construct
(
self
):
"""Lazy init"""
if
self
.
inner_dataset
is
None
:
if
self
.
reference
is
not
None
:
if
self
.
used_indices
is
None
:
self
.
inner_dataset
=
self
.
reference
.
_get_inner_dataset
().
create_valid
(
self
.
data
,
self
.
label
,
self
.
weight
,
self
.
group
,
self
.
silent
,
self
.
params
)
else
:
"""construct subset"""
self
.
inner_dataset
=
self
.
reference
.
_get_inner_dataset
().
subset
(
self
.
used_indices
,
self
.
params
)
else
:
self
.
inner_dataset
=
_InnerDataset
(
self
.
data
,
self
.
label
,
self
.
max_bin
,
None
,
self
.
weight
,
self
.
group
,
self
.
_predictor
,
self
.
silent
,
self
.
feature_name
,
self
.
categorical_feature
,
self
.
params
)
if
self
.
free_raw_data
:
self
.
data
=
None
def
_get_inner_dataset
(
self
):
"""get inner dataset"""
self
.
construct
()
return
self
.
inner_dataset
def
__is_constructed
(
self
):
"""check inner_dataset is constructed or not"""
return
self
.
inner_dataset
is
not
None
def
set_categorical_feature
(
self
,
categorical_feature
):
"""
Set categorical features
Parameters
----------
categorical_feature : list of int or str
Name/index of categorical features
"""
if
self
.
categorical_feature
==
categorical_feature
:
return
if
self
.
data
is
not
None
:
self
.
categorical_feature
=
categorical_feature
self
.
inner_dataset
=
None
else
:
raise
LightGBMError
(
"Cannot set categorical feature after freed raw data,
\
Set free_raw_data=False when construct Dataset to avoid this."
)
def
_set_predictor
(
self
,
predictor
):
"""
Set predictor for continued training, not recommand for user to call this function.
Please set init_model in engine.train or engine.cv
"""
if
predictor
is
self
.
_predictor
:
return
if
self
.
data
is
not
None
:
self
.
_predictor
=
predictor
self
.
inner_dataset
=
None
else
:
raise
LightGBMError
(
"Cannot set predictor after freed raw data,
\
Set free_raw_data=False when construct Dataset to avoid this."
)
def
set_reference
(
self
,
reference
):
"""
Set reference dataset
Parameters
----------
reference : Dataset
will use reference as template to consturct current dataset
"""
self
.
set_categorical_feature
(
reference
.
categorical_feature
)
self
.
set_feature_name
(
reference
.
feature_name
)
self
.
_set_predictor
(
reference
.
_predictor
)
if
self
.
reference
is
reference
:
return
if
self
.
data
is
not
None
:
self
.
reference
=
reference
self
.
inner_dataset
=
None
else
:
raise
LightGBMError
(
"Cannot set reference after freed raw data,
\
Set free_raw_data=False when construct Dataset to avoid this."
)
def
set_feature_name
(
self
,
feature_name
):
"""
Set feature name
Parameters
----------
feature_name : list of str
feature names
"""
self
.
feature_name
=
feature_name
if
self
.
__is_constructed
():
self
.
inner_dataset
.
set_feature_name
(
self
.
feature_name
)
def
subset
(
self
,
used_indices
,
params
=
None
):
"""
Get subset of current dataset
Parameters
----------
used_indices : list of int
use indices of this subset
params : dict
other parameters
"""
ret
=
Dataset
(
None
)
ret
.
feature_name
=
self
.
feature_name
ret
.
categorical_feature
=
self
.
categorical_feature
ret
.
reference
=
self
ret
.
_predictor
=
self
.
_predictor
ret
.
used_indices
=
used_indices
ret
.
params
=
params
return
ret
def
save_binary
(
self
,
filename
):
"""Save Dataset to binary file
Parameters
----------
filename : string
Name of the output file.
"""
self
.
_get_inner_dataset
().
save_binary
(
filename
)
def
set_label
(
self
,
label
):
"""Set label of Dataset
Parameters
----------
label: numpy array or list or None
The label information to be set into Dataset
"""
self
.
label
=
label
if
self
.
__is_constructed
():
self
.
inner_dataset
.
set_label
(
self
.
label
)
def
set_weight
(
self
,
weight
):
""" Set weight of each instance.
Parameters
----------
weight : numpy array or list or None
Weight for each data point
"""
self
.
weight
=
weight
if
self
.
__is_constructed
():
self
.
inner_dataset
.
set_weight
(
self
.
weight
)
def
set_init_score
(
self
,
init_score
):
""" Set init score of booster to start from.
Parameters
----------
init_score: numpy array or list or None
Init score for booster
"""
self
.
init_score
=
init_score
if
self
.
__is_constructed
():
self
.
inner_dataset
.
set_init_score
(
self
.
init_score
)
def
set_group
(
self
,
group
):
"""Set group size of Dataset (used for ranking).
Parameters
----------
group : numpy array or list or None
Group size of each group
"""
self
.
group
=
group
if
self
.
__is_constructed
():
self
.
inner_dataset
.
set_group
(
self
.
group
)
def
get_label
(
self
):
"""Get the label of the Dataset.
Returns
-------
label : array
"""
if
self
.
label
is
None
and
self
.
__is_constructed
():
self
.
label
=
self
.
inner_dataset
.
get_label
()
return
self
.
label
def
get_weight
(
self
):
"""Get the weight of the Dataset.
Returns
-------
weight : array
"""
if
self
.
weight
is
None
and
self
.
__is_constructed
():
self
.
weight
=
self
.
inner_dataset
.
get_weight
()
return
self
.
weight
def
get_init_score
(
self
):
"""Get the initial score of the Dataset.
Returns
-------
init_score : array
"""
if
self
.
init_score
is
None
and
self
.
__is_constructed
():
self
.
init_score
=
self
.
inner_dataset
.
get_init_score
()
return
self
.
init_score
def
get_group
(
self
):
"""Get the initial score of the Dataset.
Returns
-------
init_score : array
"""
if
self
.
group
is
None
and
self
.
__is_constructed
():
self
.
group
=
self
.
inner_dataset
.
get_group
()
return
self
.
group
def
num_data
(
self
):
"""Get the number of rows in the Dataset.
Returns
-------
number of rows : int
"""
if
self
.
__is_constructed
():
return
self
.
inner_dataset
.
num_data
()
else
:
raise
LightGBMError
(
"Cannot call num_data before construct, please call it explicitly"
)
def
num_feature
(
self
):
"""Get the number of columns (features) in the Dataset.
Returns
-------
number of columns : int
"""
if
self
.
__is_constructed
():
return
self
.
inner_dataset
.
num_feature
()
else
:
raise
LightGBMError
(
"Cannot call num_feature before construct, please call it explicitly"
)
class
Booster
(
object
):
""""A Booster of LightGBM.
"""
...
...
@@ -862,7 +1187,6 @@ class Booster(object):
"""
self
.
handle
=
ctypes
.
c_void_p
()
self
.
__need_reload_eval_info
=
True
self
.
__is_manage_handle
=
True
self
.
__train_data_name
=
"training"
self
.
__attr
=
{}
self
.
best_iteration
=
-
1
...
...
@@ -878,7 +1202,7 @@ class Booster(object):
params_str
=
param_dict_to_str
(
params
)
"""construct booster object"""
_safe_call
(
_LIB
.
LGBM_BoosterCreate
(
train_set
.
handle
,
train_set
.
_get_inner_dataset
().
handle
,
c_str
(
params_str
),
ctypes
.
byref
(
self
.
handle
)))
"""save reference to data"""
...
...
@@ -886,11 +1210,11 @@ class Booster(object):
self
.
valid_sets
=
[]
self
.
name_valid_sets
=
[]
self
.
__num_dataset
=
1
self
.
init_predictor
=
train_set
.
predictor
if
self
.
init_predictor
is
not
None
:
self
.
__
init_predictor
=
train_set
.
_
predictor
if
self
.
__
init_predictor
is
not
None
:
_safe_call
(
_LIB
.
LGBM_BoosterMerge
(
self
.
handle
,
self
.
init_predictor
.
handle
))
self
.
__
init_predictor
.
handle
))
out_num_class
=
ctypes
.
c_int64
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterGetNumClasses
(
self
.
handle
,
...
...
@@ -916,7 +1240,7 @@ class Booster(object):
raise
TypeError
(
'At least need training dataset or model file to create booster instance'
)
def
__del__
(
self
):
if
self
.
handle
is
not
None
and
self
.
__is_manage_handle
:
if
self
.
handle
is
not
None
:
_safe_call
(
_LIB
.
LGBM_BoosterFree
(
self
.
handle
))
def
set_train_data_name
(
self
,
name
):
...
...
@@ -932,11 +1256,11 @@ class Booster(object):
name : String
Name of validation data
"""
if
data
.
predictor
is
not
self
.
init_predictor
:
raise
Exception
(
"Add validation data failed, you should use same predictor for these data"
)
if
data
.
_
predictor
is
not
self
.
__
init_predictor
:
raise
LightGBMError
(
"Add validation data failed, you should use same predictor for these data"
)
_safe_call
(
_LIB
.
LGBM_BoosterAddValidData
(
self
.
handle
,
data
.
handle
))
data
.
_get_inner_dataset
().
handle
))
self
.
valid_sets
.
append
(
data
)
self
.
name_valid_sets
.
append
(
name
)
self
.
__num_dataset
+=
1
...
...
@@ -982,12 +1306,12 @@ class Booster(object):
"""need reset training data"""
if
train_set
is
not
None
and
train_set
is
not
self
.
train_set
:
if
train_set
.
predictor
is
not
self
.
init_predictor
:
raise
Exception
(
"Replace training data failed, you should use same predictor for these data"
)
if
train_set
.
_
predictor
is
not
self
.
__
init_predictor
:
raise
LightGBMError
(
"Replace training data failed, you should use same predictor for these data"
)
self
.
train_set
=
train_set
_safe_call
(
_LIB
.
LGBM_BoosterResetTrainingData
(
self
.
handle
,
self
.
train_set
.
handle
))
self
.
train_set
.
_get_inner_dataset
().
handle
))
self
.
__inner_predict_buffer
[
0
]
=
None
is_finished
=
ctypes
.
c_int
(
0
)
if
fobj
is
None
:
...
...
@@ -1063,7 +1387,7 @@ class Booster(object):
Parameters
----------
data : Dataset object
data :
_Inner
Dataset object
name :
Name of data
feval : function
...
...
@@ -1073,8 +1397,8 @@ class Booster(object):
result: list
Evaluation result list.
"""
if
not
isinstance
(
data
,
Dataset
):
raise
TypeError
(
"Can only eval for Dataset instance"
)
if
not
isinstance
(
data
,
_Inner
Dataset
):
raise
TypeError
(
"Can only eval for
_Inner
Dataset instance"
)
data_idx
=
-
1
if
data
is
self
.
train_set
:
data_idx
=
0
...
...
@@ -1187,15 +1511,13 @@ class Booster(object):
-------
Prediction result
"""
predictor
=
Predictor
(
booster_handle
=
self
.
handle
,
is_manage_handle
=
False
)
predictor
=
_Inner
Predictor
(
booster_handle
=
self
.
handle
)
return
predictor
.
predict
(
data
,
num_iteration
,
raw_score
,
pred_leaf
,
data_has_header
,
is_reshape
)
def
to_predictor
(
self
):
def
_
to_predictor
(
self
):
"""Convert to predictor
Note: Predictor will manage the handle after doing this
"""
predictor
=
Predictor
(
booster_handle
=
self
.
handle
,
is_manage_handle
=
True
)
self
.
__is_manage_handle
=
False
predictor
=
_InnerPredictor
(
booster_handle
=
self
.
handle
)
return
predictor
def
feature_importance
(
self
,
importance_type
=
'split'
):
...
...
python-package/lightgbm/engine.py
View file @
b51c7be4
...
...
@@ -6,52 +6,12 @@ from __future__ import absolute_import
import
collections
from
operator
import
attrgetter
import
numpy
as
np
from
.basic
import
LightGBMError
,
Predictor
,
Dataset
,
Booster
,
is_str
from
.basic
import
LightGBMError
,
_Inner
Predictor
,
Dataset
,
Booster
,
is_str
from
.
import
callback
def
_construct_dataset
(
X_y
,
reference
=
None
,
params
=
None
,
other_fields
=
None
,
feature_name
=
None
,
categorical_feature
=
None
,
predictor
=
None
):
if
'max_bin'
in
params
:
max_bin
=
int
(
params
[
'max_bin'
])
else
:
max_bin
=
255
weight
=
None
group
=
None
init_score
=
None
if
other_fields
is
not
None
:
if
not
isinstance
(
other_fields
,
dict
):
raise
TypeError
(
"type of other filed data should be dict"
)
weight
=
other_fields
.
get
(
'weight'
,
None
)
group
=
other_fields
.
get
(
'group'
,
None
)
init_score
=
other_fields
.
get
(
'init_score'
,
None
)
if
is_str
(
X_y
):
data
=
X_y
label
=
None
else
:
if
len
(
X_y
)
!=
2
:
raise
TypeError
(
"should pass (data, label) tuple for dataset"
)
data
=
X_y
[
0
]
label
=
X_y
[
1
]
if
reference
is
None
:
ret
=
Dataset
(
data
,
label
=
label
,
max_bin
=
max_bin
,
weight
=
weight
,
group
=
group
,
predictor
=
predictor
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
params
=
params
)
else
:
ret
=
reference
.
create_valid
(
data
,
label
=
label
,
weight
=
weight
,
group
=
group
,
params
=
params
)
if
init_score
is
not
None
:
ret
.
set_init_score
(
init_score
)
return
ret
def
train
(
params
,
train_data
,
num_boost_round
=
100
,
valid_datas
=
None
,
valid_names
=
None
,
def
train
(
params
,
train_set
,
num_boost_round
=
100
,
valid_sets
=
None
,
valid_names
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
train_fields
=
None
,
valid_fields
=
None
,
feature_name
=
None
,
categorical_feature
=
None
,
early_stopping_rounds
=
None
,
evals_result
=
None
,
verbose_eval
=
True
,
learning_rates
=
None
,
callbacks
=
None
):
...
...
@@ -61,14 +21,14 @@ def train(params, train_data, num_boost_round=100,
----------
params : dict
Parameters for training.
train_
data
: Dataset
, tuple (X, y) or filename of data
train_
set
: Dataset
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_
data
s: list of Datasets
, tuples (valid_X, valid_y) or filenames of data
valid_
set
s: list of Datasets
List of data to be evaluated during training
valid_names: list of string
Names of valid_
data
s
Names of valid_
set
s
fobj : function
Customized objective function.
feval : function
...
...
@@ -76,13 +36,6 @@ def train(params, train_data, num_boost_round=100,
Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
Other data file in training data.
\
e.g. valid_fields[0]['weight'] is weight data for first valid data
Support fields: weight, group, init_score
feature_name : list of str
Feature names
categorical_feature : list of str or int
...
...
@@ -95,8 +48,8 @@ def train(params, train_data, num_boost_round=100,
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_
data
s.
Example: with a valid_
data
s containing [valid_set, train_set]
\
This dictionary used to store all evaluation results of all the items in valid_
set
s.
Example: with a valid_
set
s containing [valid_set, train_set]
\
and valid_names containing ['eval', 'train'] and a paramater containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
...
...
@@ -127,58 +80,40 @@ def train(params, train_data, num_boost_round=100,
"""
"""create predictor first"""
if
is_str
(
init_model
):
predictor
=
Predictor
(
model_file
=
init_model
)
predictor
=
_Inner
Predictor
(
model_file
=
init_model
)
elif
isinstance
(
init_model
,
Booster
):
predictor
=
init_model
.
to_predictor
()
elif
isinstance
(
init_model
,
Predictor
):
predictor
=
init_model
predictor
=
init_model
.
_to_predictor
()
else
:
predictor
=
None
init_iteration
=
predictor
.
num_total_iteration
if
predictor
else
0
"""create dataset"""
if
isinstance
(
train_data
,
Dataset
):
train_set
=
train_data
if
train_fields
is
not
None
:
for
field
,
data
in
train_fields
.
items
():
train_set
.
set_field
(
field
,
data
)
else
:
train_set
=
_construct_dataset
(
train_data
,
None
,
params
,
other_fields
=
train_fields
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
predictor
=
predictor
)
"""check dataset"""
if
not
isinstance
(
train_set
,
Dataset
):
raise
TypeError
(
"only can accept Dataset instance for traninig"
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_categorical_feature
(
categorical_feature
)
is_valid_contain_train
=
False
train_data_name
=
"training"
valid_sets
=
[]
reduced_
valid_sets
=
[]
name_valid_sets
=
[]
if
valid_
data
s
:
if
isinstance
(
valid_
data
s
,
(
Dataset
,
tuple
)
):
valid_
data
s
=
[
valid_
data
s
]
if
valid_
set
s
:
if
isinstance
(
valid_
set
s
,
Dataset
):
valid_
set
s
=
[
valid_
set
s
]
if
isinstance
(
valid_names
,
str
):
valid_names
=
[
valid_names
]
for
i
,
valid_data
in
enumerate
(
valid_datas
):
other_fields
=
None
if
valid_fields
is
None
else
valid_fields
.
get
(
i
,
None
)
for
i
,
valid_data
in
enumerate
(
valid_sets
):
"""reduce cost for prediction training data"""
if
valid_data
[
0
]
is
train_
data
[
0
]
and
valid_data
[
1
]
is
train_data
[
1
]
:
if
valid_data
is
train_
set
:
is_valid_contain_train
=
True
if
valid_names
is
not
None
:
train_data_name
=
valid_names
[
i
]
continue
if
isinstance
(
valid_data
,
Dataset
):
valid_set
=
valid_data
if
other_fields
is
not
None
:
for
field
,
data
in
other_fields
.
items
():
valid_set
.
set_field
(
field
,
data
)
else
:
valid_set
=
_construct_dataset
(
valid_data
,
train_set
,
params
,
other_fields
=
other_fields
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
predictor
=
predictor
)
valid_sets
.
append
(
valid_set
)
if
not
isinstance
(
valid_data
,
Dataset
):
raise
TypeError
(
"only can accept Dataset instance for traninig"
)
valid_data
.
set_reference
(
train_set
)
reduced_valid_sets
.
append
(
valid_data
)
if
valid_names
is
not
None
and
len
(
valid_names
)
>
i
:
name_valid_sets
.
append
(
valid_names
[
i
])
else
:
...
...
@@ -217,7 +152,7 @@ def train(params, train_data, num_boost_round=100,
booster
=
Booster
(
params
=
params
,
train_set
=
train_set
)
if
is_valid_contain_train
:
booster
.
set_train_data_name
(
train_data_name
)
for
valid_set
,
name_valid_set
in
zip
(
valid_sets
,
name_valid_sets
):
for
valid_set
,
name_valid_set
in
zip
(
reduced_
valid_sets
,
name_valid_sets
):
booster
.
add_valid
(
valid_set
,
name_valid_set
)
"""start training"""
...
...
@@ -294,6 +229,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
else
:
raise
LightGBMError
(
'sklearn needs to be installed in order to use stratified cv'
)
else
:
full_data
.
construct
()
randidx
=
np
.
random
.
permutation
(
full_data
.
num_data
())
kstep
=
int
(
len
(
randidx
)
/
nfold
)
idset
=
[
randidx
[(
i
*
kstep
):
min
(
len
(
randidx
),
(
i
+
1
)
*
kstep
)]
for
i
in
range
(
nfold
)]
...
...
@@ -322,8 +258,8 @@ def _agg_cv_result(raw_results):
cvmap
[
one_line
[
1
]].
append
(
one_line
[
2
])
return
[(
'cv_agg'
,
k
,
np
.
mean
(
v
),
metric_type
[
k
],
np
.
std
(
v
))
for
k
,
v
in
cvmap
.
items
()]
def
cv
(
params
,
train_
data
,
num_boost_round
=
10
,
nfold
=
5
,
stratified
=
False
,
metrics
=
(),
fobj
=
None
,
feval
=
None
,
train_fields
=
None
,
def
cv
(
params
,
train_
set
,
num_boost_round
=
10
,
nfold
=
5
,
stratified
=
False
,
metrics
=
(),
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
None
,
categorical_feature
=
None
,
early_stopping_rounds
=
None
,
fpreproc
=
None
,
verbose_eval
=
None
,
show_stdv
=
True
,
seed
=
0
,
...
...
@@ -334,7 +270,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
----------
params : dict
Booster params.
train_
data : tuple (X, y) or filename of data
train_
set : Dataset
Data to be trained.
num_boost_round : int
Number of boosting iterations.
...
...
@@ -350,9 +286,8 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
Custom objective function.
feval : function
Custom evaluation function.
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
feature_name : list of str
Feature names
categorical_feature : list of str or int
...
...
@@ -382,6 +317,20 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
-------
evaluation history : list(string)
"""
if
not
isinstance
(
train_set
,
Dataset
):
raise
TypeError
(
"only can accept Dataset instance for traninig"
)
if
is_str
(
init_model
):
predictor
=
_InnerPredictor
(
model_file
=
init_model
)
elif
isinstance
(
init_model
,
Booster
):
predictor
=
init_model
.
_to_predictor
()
else
:
predictor
=
None
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_categorical_feature
(
categorical_feature
)
if
metrics
:
params
.
setdefault
(
'metric'
,
[])
if
is_str
(
metrics
):
...
...
@@ -389,11 +338,6 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
else
:
params
[
'metric'
].
extend
(
metrics
)
train_set
=
_construct_dataset
(
train_data
,
None
,
params
,
other_fields
=
train_fields
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
)
results
=
collections
.
defaultdict
(
list
)
cvfolds
=
_make_n_folds
(
train_set
,
nfold
,
params
,
seed
,
fpreproc
,
stratified
)
...
...
python-package/lightgbm/libpath.py
View file @
b51c7be4
...
...
@@ -19,6 +19,7 @@ def find_lib_path():
if
os
.
name
==
'nt'
:
dll_path
.
append
(
os
.
path
.
join
(
curr_path
,
'../../windows/x64/Dll/'
))
dll_path
.
append
(
os
.
path
.
join
(
curr_path
,
'./windows/x64/Dll/'
))
dll_path
.
append
(
os
.
path
.
join
(
curr_path
,
'../../Release/'
))
dll_path
=
[
os
.
path
.
join
(
p
,
'lib_lightgbm.dll'
)
for
p
in
dll_path
]
else
:
dll_path
=
[
os
.
path
.
join
(
p
,
'lib_lightgbm.so'
)
for
p
in
dll_path
]
...
...
python-package/lightgbm/sklearn.py
View file @
b51c7be4
...
...
@@ -4,7 +4,7 @@
from
__future__
import
absolute_import
import
numpy
as
np
from
.basic
import
LightGBMError
,
is_str
from
.basic
import
LightGBMError
,
Dataset
,
is_str
from
.engine
import
train
# sklearn
try
:
...
...
@@ -195,9 +195,12 @@ class LGBMModel(LGBMModelBase):
params
.
pop
(
'nthread'
,
None
)
return
params
def
fit
(
self
,
X
,
y
,
eval_set
=
None
,
eval_metric
=
None
,
def
fit
(
self
,
X
,
y
,
sample_weight
=
None
,
init_score
=
None
,
group
=
None
,
eval_set
=
None
,
eval_sample_weight
=
None
,
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
None
,
early_stopping_rounds
=
None
,
verbose
=
True
,
train_fields
=
None
,
valid_fields
=
None
,
feature_name
=
None
,
categorical_feature
=
None
,
other_params
=
None
):
"""
...
...
@@ -209,24 +212,29 @@ class LGBMModel(LGBMModelBase):
Feature matrix
y : array_like
Labels
sample_weight : array_like
weight of training data
init_score : array_like
init score of training data
group : array_like
group data of training data
eval_set : list, optional
A list of (X, y) tuple pairs to use as a validation set for early-stopping
eval_sample_weight : List of array
weight of eval data
eval_init_score : List of array
init score of eval data
eval_group : List of array
group data of eval data
eval_metric : str, list of str, callable, optional
If a str, should be a built-in evaluation metric to use.
If callable, a custom evaluation metric. The call
\
signature is func(y_predicted, dataset) where dataset will be a
\
Dat
a
set
f
object such that you may need to call the get_label
\
Dat
e
set object such that you may need to call the get_label
\
method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
early_stopping_rounds : int
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
train_fields : dict
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
Other data file in training data.
\
e.g. valid_fields[0]['weight'] is weight data for first valid data
Support fields: weight, group, init_score
feature_name : list of str
Feature names
categorical_feature : list of str or int
...
...
@@ -263,12 +271,33 @@ class LGBMModel(LGBMModelBase):
feval
=
None
feval
=
eval_metric
if
callable
(
eval_metric
)
else
None
self
.
_Booster
=
train
(
params
,
(
X
,
y
),
self
.
n_estimators
,
valid_datas
=
eval_set
,
def
_construct_dataset
(
X
,
y
,
sample_weight
,
init_score
,
group
):
ret
=
Dataset
(
X
,
label
=
y
,
weight
=
sample_weight
,
group
=
group
)
ret
.
set_init_score
(
init_score
)
return
ret
train_set
=
_construct_dataset
(
X
,
y
,
sample_weight
,
init_score
,
group
)
valid_sets
=
[]
if
eval_set
is
not
None
:
if
isinstance
(
eval_set
,
tuple
):
eval_set
=
[
eval_set
]
for
i
,
valid_data
in
enumerate
(
eval_set
):
"""reduce cost for prediction training data"""
if
valid_data
[
0
]
is
X
and
valid_data
[
1
]
is
y
:
valid_set
=
train_set
else
:
valid_weight
=
None
if
eval_sample_weight
is
None
else
eval_sample_weight
.
get
(
i
,
None
)
valid_init_score
=
None
if
eval_init_score
is
None
else
eval_init_score
.
get
(
i
,
None
)
valid_group
=
None
if
eval_group
is
None
else
eval_group
.
get
(
i
,
None
)
valid_set
=
_construct_dataset
(
valid_data
[
0
],
valid_data
[
1
],
valid_weight
,
valid_init_score
,
valid_group
)
valid_sets
.
append
(
valid_set
)
self
.
_Booster
=
train
(
params
,
train_set
,
self
.
n_estimators
,
valid_sets
=
valid_sets
,
early_stopping_rounds
=
early_stopping_rounds
,
evals_result
=
evals_result
,
fobj
=
self
.
fobj
,
feval
=
feval
,
verbose_eval
=
verbose
,
train_fields
=
train_fields
,
valid_fields
=
valid_fields
,
feature_name
=
feature_name
,
verbose_eval
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
)
if
evals_result
:
...
...
@@ -331,14 +360,48 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__
=
"""Implementation of the scikit-learn API for LightGBM regression.
"""
+
'
\n
'
.
join
(
LGBMModel
.
__doc__
.
split
(
'
\n
'
)[
2
:])
def
fit
(
self
,
X
,
y
,
sample_weight
=
None
,
init_score
=
None
,
eval_set
=
None
,
eval_sample_weight
=
None
,
eval_init_score
=
None
,
eval_metric
=
None
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
None
,
categorical_feature
=
None
,
other_params
=
None
):
super
(
LGBMRegressor
,
self
).
fit
(
X
,
y
,
sample_weight
,
init_score
,
None
,
eval_set
,
eval_sample_weight
,
eval_init_score
,
None
,
eval_metric
,
early_stopping_rounds
,
verbose
,
feature_name
,
categorical_feature
,
other_params
)
return
self
class
LGBMClassifier
(
LGBMModel
,
LGBMClassifierBase
):
__doc__
=
"""Implementation of the scikit-learn API for LightGBM classification.
"""
+
'
\n
'
.
join
(
LGBMModel
.
__doc__
.
split
(
'
\n
'
)[
2
:])
def
fit
(
self
,
X
,
y
,
eval_set
=
None
,
eval_metric
=
None
,
def
__init__
(
self
,
num_leaves
=
31
,
max_depth
=-
1
,
learning_rate
=
0.1
,
n_estimators
=
10
,
max_bin
=
255
,
silent
=
True
,
objective
=
"binary"
,
nthread
=-
1
,
min_split_gain
=
0
,
min_child_weight
=
5
,
min_child_samples
=
10
,
subsample
=
1
,
subsample_freq
=
1
,
colsample_bytree
=
1
,
reg_alpha
=
0
,
reg_lambda
=
0
,
scale_pos_weight
=
1
,
is_unbalance
=
False
,
seed
=
0
):
super
(
LGBMClassifier
,
self
).
__init__
(
num_leaves
,
max_depth
,
learning_rate
,
n_estimators
,
max_bin
,
silent
,
objective
,
nthread
,
min_split_gain
,
min_child_weight
,
min_child_samples
,
subsample
,
subsample_freq
,
colsample_bytree
,
reg_alpha
,
reg_lambda
,
scale_pos_weight
,
is_unbalance
,
seed
)
def
fit
(
self
,
X
,
y
,
sample_weight
=
None
,
init_score
=
None
,
eval_set
=
None
,
eval_sample_weight
=
None
,
eval_init_score
=
None
,
eval_metric
=
None
,
early_stopping_rounds
=
None
,
verbose
=
True
,
train_fields
=
None
,
valid_fields
=
None
,
feature_name
=
None
,
categorical_feature
=
None
,
other_params
=
None
):
...
...
@@ -350,12 +413,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
# Switch to using a multiclass objective in the underlying LGBM instance
self
.
objective
=
"multiclass"
other_params
[
'num_class'
]
=
self
.
n_classes_
if
eval_metric
is
None
and
eval_set
is
not
None
:
eval_metric
=
"multi_logloss"
else
:
self
.
objective
=
"binary"
if
eval_metric
is
None
and
eval_set
is
not
None
:
eval_metric
=
"binary_logloss"
self
.
_le
=
LGBMLabelEncoder
().
fit
(
y
)
training_labels
=
self
.
_le
.
transform
(
y
)
...
...
@@ -363,10 +420,10 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
if
eval_set
is
not
None
:
eval_set
=
list
((
x
[
0
],
self
.
_le
.
transform
(
x
[
1
]))
for
x
in
eval_set
)
super
(
LGBMClassifier
,
self
).
fit
(
X
,
training_labels
,
eval_set
,
super
(
LGBMClassifier
,
self
).
fit
(
X
,
training_labels
,
sample_weight
,
init_score
,
None
,
eval_set
,
eval_sample_weight
,
eval_init_score
,
None
,
eval_metric
,
early_stopping_rounds
,
verbose
,
train_fields
,
valid_fields
,
feature_name
,
categorical_feature
,
verbose
,
feature_name
,
categorical_feature
,
other_params
)
return
self
...
...
@@ -442,34 +499,59 @@ class LGBMRanker(LGBMModel):
"""
+
'
\n
'
.
join
(
LGBMModel
.
__doc__
.
split
(
'
\n
'
)[
2
:])
def
fit
(
self
,
X
,
y
,
eval_set
=
None
,
eval_metric
=
None
,
def
__init__
(
self
,
num_leaves
=
31
,
max_depth
=-
1
,
learning_rate
=
0.1
,
n_estimators
=
10
,
max_bin
=
255
,
silent
=
True
,
objective
=
"lambdarank"
,
nthread
=-
1
,
min_split_gain
=
0
,
min_child_weight
=
5
,
min_child_samples
=
10
,
subsample
=
1
,
subsample_freq
=
1
,
colsample_bytree
=
1
,
reg_alpha
=
0
,
reg_lambda
=
0
,
scale_pos_weight
=
1
,
is_unbalance
=
False
,
seed
=
0
):
super
(
LGBMRanker
,
self
).
__init__
(
num_leaves
,
max_depth
,
learning_rate
,
n_estimators
,
max_bin
,
silent
,
objective
,
nthread
,
min_split_gain
,
min_child_weight
,
min_child_samples
,
subsample
,
subsample_freq
,
colsample_bytree
,
reg_alpha
,
reg_lambda
,
scale_pos_weight
,
is_unbalance
,
seed
)
if
callable
(
self
.
objective
):
self
.
fobj
=
_group_wise_objective
(
self
.
objective
)
else
:
self
.
fobj
=
None
def
fit
(
self
,
X
,
y
,
sample_weight
=
None
,
init_score
=
None
,
group
=
None
,
eval_set
=
None
,
eval_sample_weight
=
None
,
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
None
,
eval_at
=
None
,
early_stopping_rounds
=
None
,
verbose
=
True
,
train_fields
=
None
,
valid_fields
=
None
,
other_params
=
None
):
feature_name
=
None
,
categorical_feature
=
None
,
other_params
=
None
):
"""
Most arguments like LGBMModel.fit except following:
eval_at : list of int
The evaulation positions of NDCG
"""
"""check group data"""
if
"
group
"
not
in
train_fields
:
raise
ValueError
(
"should se
t
group
in train_fields
for ranking task"
)
if
group
is
None
:
raise
ValueError
(
"should
u
se group for ranking task"
)
if
eval_set
is
not
None
:
if
val
id_fields
is
None
:
raise
ValueError
(
"val
id_fields
cannot be None when eval_set is not None"
)
elif
len
(
val
id_fields
)
!=
len
(
eval_set
):
raise
ValueError
(
"leng
h
t of val
id_fields
should equal with eval_set"
)
if
e
val
_group
is
None
:
raise
ValueError
(
"
e
val
_group
cannot be None when eval_set is not None"
)
elif
len
(
e
val
_group
)
!=
len
(
eval_set
):
raise
ValueError
(
"lengt
h
of
e
val
_group
should equal with eval_set"
)
else
:
for
inner
in
valid_fields
:
if
"group"
not
in
inner
:
raise
ValueError
(
"should set group in valid_fields for ranking task"
)
if
callable
(
self
.
objective
):
self
.
fobj
=
_group_wise_objective
(
self
.
objective
)
else
:
self
.
objective
=
"lambdarank"
self
.
fobj
=
None
if
eval_metric
is
None
and
eval_set
is
not
None
:
eval_metric
=
"ndcg"
super
(
LGBMRanker
,
self
).
fit
(
X
,
y
,
eval_set
,
eval_metric
,
early_stopping_rounds
,
verbose
,
train_fields
,
valid_fields
,
for
inner_group
in
eval_group
:
if
inner_group
is
None
:
raise
ValueError
(
"should set group for all eval data for ranking task"
)
if
eval_at
is
not
None
:
other_params
=
{}
if
other_params
is
None
else
other_params
other_params
[
'ndcg_eval_at'
]
=
list
(
eval_at
)
super
(
LGBMRanker
,
self
).
fit
(
X
,
y
,
sample_weight
,
init_score
,
group
,
eval_set
,
eval_sample_weight
,
eval_init_score
,
eval_group
,
eval_metric
,
early_stopping_rounds
,
verbose
,
feature_name
,
categorical_feature
,
other_params
)
return
self
src/io/config.cpp
View file @
b51c7be4
#include <LightGBM/config.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/log.h>
#include <vector>
#include <string>
#include <unordered_set>
#include <algorithm>
#include <limits>
namespace
LightGBM
{
...
...
@@ -22,7 +24,7 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par
continue
;
}
params
[
key
]
=
value
;
}
else
{
}
else
if
(
Common
::
Trim
(
arg
).
size
()
>
0
)
{
Log
::
Warning
(
"Unknown parameter %s"
,
arg
.
c_str
());
}
}
...
...
@@ -33,12 +35,21 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par
void
OverallConfig
::
Set
(
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
params
)
{
// load main config types
GetInt
(
params
,
"num_threads"
,
&
num_threads
);
// generate seeds by seed.
if
(
GetInt
(
params
,
"seed"
,
&
seed
))
{
Random
rand
(
seed
);
int
int_max
=
std
::
numeric_limits
<
int
>::
max
();
io_config
.
data_random_seed
=
static_cast
<
int
>
(
rand
.
NextInt
(
0
,
int_max
));
boosting_config
.
bagging_seed
=
static_cast
<
int
>
(
rand
.
NextInt
(
0
,
int_max
));
boosting_config
.
drop_seed
=
static_cast
<
int
>
(
rand
.
NextInt
(
0
,
int_max
));
boosting_config
.
tree_config
.
feature_fraction_seed
=
static_cast
<
int
>
(
rand
.
NextInt
(
0
,
int_max
));
}
GetTaskType
(
params
);
GetBoostingType
(
params
);
GetObjectiveType
(
params
);
GetMetricType
(
params
);
// sub-config setup
network_config
.
Set
(
params
);
io_config
.
Set
(
params
);
...
...
tests/python_package_test/test_basic.py
View file @
b51c7be4
...
...
@@ -8,10 +8,6 @@ x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_s
train_data
=
lgb
.
Dataset
(
x_train
,
max_bin
=
255
,
label
=
y_train
)
num_features
=
train_data
.
num_feature
()
names
=
[
"name_%d"
%
(
i
)
for
i
in
range
(
num_features
)]
train_data
.
set_feature_name
(
names
)
valid_data
=
train_data
.
create_valid
(
x_test
,
label
=
y_test
)
config
=
{
"objective"
:
"binary"
,
"metric"
:
"auc"
,
"min_data"
:
1
,
"num_leaves"
:
15
}
...
...
tests/python_package_test/test_engine.py
0 → 100644
View file @
b51c7be4
# coding: utf-8
# pylint: disable = invalid-name, C0111
import
json
import
lightgbm
as
lgb
import
pandas
as
pd
from
sklearn.metrics
import
mean_squared_error
# load or create your dataset
df_train
=
pd
.
read_csv
(
'../../examples/regression/regression.train'
,
header
=
None
,
sep
=
'
\t
'
)
df_test
=
pd
.
read_csv
(
'../../examples/regression/regression.test'
,
header
=
None
,
sep
=
'
\t
'
)
y_train
=
df_train
[
0
]
y_test
=
df_test
[
0
]
X_train
=
df_train
.
drop
(
0
,
axis
=
1
)
X_test
=
df_test
.
drop
(
0
,
axis
=
1
)
# create dataset for lightgbm
lgb_train
=
lgb
.
Dataset
(
X_train
,
y_train
,
free_raw_data
=
False
)
lgb_eval
=
lgb
.
Dataset
(
X_test
,
y_test
,
reference
=
lgb_train
,
free_raw_data
=
False
)
# specify your configurations as a dict
params
=
{
'task'
:
'train'
,
'boosting_type'
:
'gbdt'
,
'objective'
:
'regression'
,
'metric'
:
{
'l2'
,
'auc'
},
'num_leaves'
:
31
,
'learning_rate'
:
0.05
,
'feature_fraction'
:
0.9
,
'bagging_fraction'
:
0.8
,
'bagging_freq'
:
5
,
'verbose'
:
0
}
# train
init_gbm
=
lgb
.
train
(
params
,
lgb_train
,
num_boost_round
=
5
,
valid_sets
=
lgb_eval
)
print
(
'Start continue train'
)
gbm
=
lgb
.
train
(
params
,
lgb_train
,
num_boost_round
=
100
,
valid_sets
=
lgb_eval
,
early_stopping_rounds
=
10
,
init_model
=
init_gbm
)
# save model to file
gbm
.
save_model
(
'model.txt'
)
# predict
y_pred
=
gbm
.
predict
(
X_test
,
num_iteration
=
gbm
.
best_iteration
)
# eval
print
(
'The rmse of prediction is:'
,
mean_squared_error
(
y_test
,
y_pred
)
**
0.5
)
# dump model to json (and save to file)
model_json
=
gbm
.
dump_model
()
with
open
(
'model.json'
,
'w+'
)
as
f
:
json
.
dump
(
model_json
,
f
,
indent
=
4
)
# feature importances
print
(
'Feature importances:'
,
gbm
.
feature_importance
())
print
(
'Feature importances:'
,
gbm
.
feature_importance
(
"gain"
))
print
(
'Start test cv'
)
lgb
.
cv
(
params
,
lgb_train
,
num_boost_round
=
100
,
nfold
=
5
,
verbose_eval
=
5
,
init_model
=
init_gbm
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment