Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
1e61f24f
"git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "1ee7c2927e68b730a3cd48486f239a1456002cea"
Unverified
Commit
1e61f24f
authored
Jan 24, 2018
by
Guolin Ke
Committed by
GitHub
Jan 24, 2018
Browse files
try to fix problem with multi-dimensional sliced object. (#1210)
parent
61fb5ea2
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
82 additions
and
20 deletions
+82
-20
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+31
-20
tests/python_package_test/test_engine.py
tests/python_package_test/test_engine.py
+51
-0
No files found.
python-package/lightgbm/basic.py
View file @
1e61f24f
...
@@ -179,11 +179,22 @@ FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
...
@@ -179,11 +179,22 @@ FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
"group"
:
C_API_DTYPE_INT32
}
"group"
:
C_API_DTYPE_INT32
}
def convert_from_sliced_object(data):
    """Fix the memory layout of a multi-dimensional sliced object.

    LightGBM's C API expects C-contiguous buffers; a slice (view) of a
    numpy array may be non-contiguous, so it is copied here.

    Parameters
    ----------
    data : object
        Candidate input. Only numpy ndarrays that are views of another
        ndarray are affected; any other object is returned unchanged.

    Returns
    -------
    object
        A C-contiguous copy when ``data`` is a non-contiguous ndarray
        view; otherwise ``data`` itself.
    """
    # Check isinstance(data, np.ndarray) FIRST: the previous ordering read
    # data.base before the type check, raising AttributeError for plain
    # lists and other objects that have no ``base`` attribute.
    if isinstance(data, np.ndarray) and data.base is not None and isinstance(data.base, np.ndarray):
        if not data.flags.c_contiguous:
            warnings.warn("Use subset(sliced data) of np.ndarray is not recommended due to it will double the peak memory cost in LightGBM.")
            return np.copy(data)
    return data
def
c_float_array
(
data
):
def
c_float_array
(
data
):
"""get pointer of float numpy array / list"""
"""get pointer of float numpy array / list"""
if
is_1d_list
(
data
):
if
is_1d_list
(
data
):
data
=
np
.
array
(
data
,
copy
=
False
)
data
=
np
.
array
(
data
,
copy
=
False
)
if
is_numpy_1d_array
(
data
):
if
is_numpy_1d_array
(
data
):
data
=
convert_from_sliced_object
(
data
)
assert
data
.
flags
.
c_contiguous
if
data
.
dtype
==
np
.
float32
:
if
data
.
dtype
==
np
.
float32
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))
type_data
=
C_API_DTYPE_FLOAT32
type_data
=
C_API_DTYPE_FLOAT32
...
@@ -195,7 +206,7 @@ def c_float_array(data):
...
@@ -195,7 +206,7 @@ def c_float_array(data):
.
format
(
data
.
dtype
))
.
format
(
data
.
dtype
))
else
:
else
:
raise
TypeError
(
"Unknown type({})"
.
format
(
type
(
data
).
__name__
))
raise
TypeError
(
"Unknown type({})"
.
format
(
type
(
data
).
__name__
))
return
(
ptr_data
,
type_data
)
return
(
ptr_data
,
type_
data
,
data
)
def
c_int_array
(
data
):
def
c_int_array
(
data
):
...
@@ -203,6 +214,8 @@ def c_int_array(data):
...
@@ -203,6 +214,8 @@ def c_int_array(data):
if
is_1d_list
(
data
):
if
is_1d_list
(
data
):
data
=
np
.
array
(
data
,
copy
=
False
)
data
=
np
.
array
(
data
,
copy
=
False
)
if
is_numpy_1d_array
(
data
):
if
is_numpy_1d_array
(
data
):
data
=
convert_from_sliced_object
(
data
)
assert
data
.
flags
.
c_contiguous
if
data
.
dtype
==
np
.
int32
:
if
data
.
dtype
==
np
.
int32
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_int32
))
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_int32
))
type_data
=
C_API_DTYPE_INT32
type_data
=
C_API_DTYPE_INT32
...
@@ -214,7 +227,7 @@ def c_int_array(data):
...
@@ -214,7 +227,7 @@ def c_int_array(data):
.
format
(
data
.
dtype
))
.
format
(
data
.
dtype
))
else
:
else
:
raise
TypeError
(
"Unknown type({})"
.
format
(
type
(
data
).
__name__
))
raise
TypeError
(
"Unknown type({})"
.
format
(
type
(
data
).
__name__
))
return
(
ptr_data
,
type_data
)
return
(
ptr_data
,
type_
data
,
data
)
PANDAS_DTYPE_MAPPER
=
{
'int8'
:
'int'
,
'int16'
:
'int'
,
'int32'
:
'int'
,
PANDAS_DTYPE_MAPPER
=
{
'int8'
:
'int'
,
'int16'
:
'int'
,
'int32'
:
'int'
,
...
@@ -472,7 +485,7 @@ class _InnerPredictor(object):
...
@@ -472,7 +485,7 @@ class _InnerPredictor(object):
else
:
else
:
"""change non-float data to float data, need to copy"""
"""change non-float data to float data, need to copy"""
data
=
np
.
array
(
mat
.
reshape
(
mat
.
size
),
dtype
=
np
.
float32
)
data
=
np
.
array
(
mat
.
reshape
(
mat
.
size
),
dtype
=
np
.
float32
)
ptr_data
,
type_ptr_data
=
c_float_array
(
data
)
ptr_data
,
type_ptr_data
,
_
=
c_float_array
(
data
)
n_preds
=
self
.
__get_num_preds
(
num_iteration
,
mat
.
shape
[
0
],
n_preds
=
self
.
__get_num_preds
(
num_iteration
,
mat
.
shape
[
0
],
predict_type
)
predict_type
)
preds
=
np
.
zeros
(
n_preds
,
dtype
=
np
.
float64
)
preds
=
np
.
zeros
(
n_preds
,
dtype
=
np
.
float64
)
...
@@ -502,8 +515,8 @@ class _InnerPredictor(object):
...
@@ -502,8 +515,8 @@ class _InnerPredictor(object):
preds
=
np
.
zeros
(
n_preds
,
dtype
=
np
.
float64
)
preds
=
np
.
zeros
(
n_preds
,
dtype
=
np
.
float64
)
out_num_preds
=
ctypes
.
c_int64
(
0
)
out_num_preds
=
ctypes
.
c_int64
(
0
)
ptr_indptr
,
type_ptr_indptr
=
c_int_array
(
csr
.
indptr
)
ptr_indptr
,
type_ptr_indptr
,
__
=
c_int_array
(
csr
.
indptr
)
ptr_data
,
type_ptr_data
=
c_float_array
(
csr
.
data
)
ptr_data
,
type_ptr_data
,
_
=
c_float_array
(
csr
.
data
)
_safe_call
(
_LIB
.
LGBM_BoosterPredictForCSR
(
_safe_call
(
_LIB
.
LGBM_BoosterPredictForCSR
(
self
.
handle
,
self
.
handle
,
...
@@ -533,8 +546,8 @@ class _InnerPredictor(object):
...
@@ -533,8 +546,8 @@ class _InnerPredictor(object):
preds
=
np
.
zeros
(
n_preds
,
dtype
=
np
.
float64
)
preds
=
np
.
zeros
(
n_preds
,
dtype
=
np
.
float64
)
out_num_preds
=
ctypes
.
c_int64
(
0
)
out_num_preds
=
ctypes
.
c_int64
(
0
)
ptr_indptr
,
type_ptr_indptr
=
c_int_array
(
csc
.
indptr
)
ptr_indptr
,
type_ptr_indptr
,
__
=
c_int_array
(
csc
.
indptr
)
ptr_data
,
type_ptr_data
=
c_float_array
(
csc
.
data
)
ptr_data
,
type_ptr_data
,
_
=
c_float_array
(
csc
.
data
)
_safe_call
(
_LIB
.
LGBM_BoosterPredictForCSC
(
_safe_call
(
_LIB
.
LGBM_BoosterPredictForCSC
(
self
.
handle
,
self
.
handle
,
...
@@ -747,7 +760,7 @@ class Dataset(object):
...
@@ -747,7 +760,7 @@ class Dataset(object):
# change non-float data to float data, need to copy
# change non-float data to float data, need to copy
data
=
np
.
array
(
mat
.
reshape
(
mat
.
size
),
dtype
=
np
.
float32
)
data
=
np
.
array
(
mat
.
reshape
(
mat
.
size
),
dtype
=
np
.
float32
)
ptr_data
,
type_ptr_data
=
c_float_array
(
data
)
ptr_data
,
type_ptr_data
,
_
=
c_float_array
(
data
)
_safe_call
(
_LIB
.
LGBM_DatasetCreateFromMat
(
_safe_call
(
_LIB
.
LGBM_DatasetCreateFromMat
(
ptr_data
,
ptr_data
,
ctypes
.
c_int
(
type_ptr_data
),
ctypes
.
c_int
(
type_ptr_data
),
...
@@ -766,8 +779,8 @@ class Dataset(object):
...
@@ -766,8 +779,8 @@ class Dataset(object):
raise
ValueError
(
'Length mismatch: {} vs {}'
.
format
(
len
(
csr
.
indices
),
len
(
csr
.
data
)))
raise
ValueError
(
'Length mismatch: {} vs {}'
.
format
(
len
(
csr
.
indices
),
len
(
csr
.
data
)))
self
.
handle
=
ctypes
.
c_void_p
()
self
.
handle
=
ctypes
.
c_void_p
()
ptr_indptr
,
type_ptr_indptr
=
c_int_array
(
csr
.
indptr
)
ptr_indptr
,
type_ptr_indptr
,
__
=
c_int_array
(
csr
.
indptr
)
ptr_data
,
type_ptr_data
=
c_float_array
(
csr
.
data
)
ptr_data
,
type_ptr_data
,
_
=
c_float_array
(
csr
.
data
)
_safe_call
(
_LIB
.
LGBM_DatasetCreateFromCSR
(
_safe_call
(
_LIB
.
LGBM_DatasetCreateFromCSR
(
ptr_indptr
,
ptr_indptr
,
...
@@ -790,8 +803,8 @@ class Dataset(object):
...
@@ -790,8 +803,8 @@ class Dataset(object):
raise
ValueError
(
'Length mismatch: {} vs {}'
.
format
(
len
(
csc
.
indices
),
len
(
csc
.
data
)))
raise
ValueError
(
'Length mismatch: {} vs {}'
.
format
(
len
(
csc
.
indices
),
len
(
csc
.
data
)))
self
.
handle
=
ctypes
.
c_void_p
()
self
.
handle
=
ctypes
.
c_void_p
()
ptr_indptr
,
type_ptr_indptr
=
c_int_array
(
csc
.
indptr
)
ptr_indptr
,
type_ptr_indptr
,
__
=
c_int_array
(
csc
.
indptr
)
ptr_data
,
type_ptr_data
=
c_float_array
(
csc
.
data
)
ptr_data
,
type_ptr_data
,
_
=
c_float_array
(
csc
.
data
)
_safe_call
(
_LIB
.
LGBM_DatasetCreateFromCSC
(
_safe_call
(
_LIB
.
LGBM_DatasetCreateFromCSC
(
ptr_indptr
,
ptr_indptr
,
...
@@ -824,6 +837,7 @@ class Dataset(object):
...
@@ -824,6 +837,7 @@ class Dataset(object):
else
:
else
:
# construct subset
# construct subset
used_indices
=
list_to_1d_numpy
(
self
.
used_indices
,
np
.
int32
,
name
=
'used_indices'
)
used_indices
=
list_to_1d_numpy
(
self
.
used_indices
,
np
.
int32
,
name
=
'used_indices'
)
assert
used_indices
.
flags
.
c_contiguous
self
.
handle
=
ctypes
.
c_void_p
()
self
.
handle
=
ctypes
.
c_void_p
()
params_str
=
param_dict_to_str
(
self
.
params
)
params_str
=
param_dict_to_str
(
self
.
params
)
_safe_call
(
_LIB
.
LGBM_DatasetGetSubset
(
_safe_call
(
_LIB
.
LGBM_DatasetGetSubset
(
...
@@ -952,15 +966,10 @@ class Dataset(object):
...
@@ -952,15 +966,10 @@ class Dataset(object):
elif
field_name
==
'init_score'
:
elif
field_name
==
'init_score'
:
dtype
=
np
.
float64
dtype
=
np
.
float64
data
=
list_to_1d_numpy
(
data
,
dtype
,
name
=
field_name
)
data
=
list_to_1d_numpy
(
data
,
dtype
,
name
=
field_name
)
if
data
.
dtype
==
np
.
float32
:
if
data
.
dtype
==
np
.
float32
or
data
.
dtype
==
np
.
float64
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))
ptr_data
,
type_data
,
_
=
c_float_array
(
data
)
type_data
=
C_API_DTYPE_FLOAT32
elif
data
.
dtype
==
np
.
float64
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_double
))
type_data
=
C_API_DTYPE_FLOAT64
elif
data
.
dtype
==
np
.
int32
:
elif
data
.
dtype
==
np
.
int32
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_int32
))
ptr_data
,
type_data
,
_
=
c_int_array
(
data
)
type_data
=
C_API_DTYPE_INT32
else
:
else
:
raise
TypeError
(
"Excepted np.float32/64 or np.int32, meet type({})"
.
format
(
data
.
dtype
))
raise
TypeError
(
"Excepted np.float32/64 or np.int32, meet type({})"
.
format
(
data
.
dtype
))
if
type_data
!=
FIELD_TYPE_MAPPER
[
field_name
]:
if
type_data
!=
FIELD_TYPE_MAPPER
[
field_name
]:
...
@@ -1536,6 +1545,8 @@ class Booster(object):
...
@@ -1536,6 +1545,8 @@ class Booster(object):
"""
"""
grad
=
list_to_1d_numpy
(
grad
,
name
=
'gradient'
)
grad
=
list_to_1d_numpy
(
grad
,
name
=
'gradient'
)
hess
=
list_to_1d_numpy
(
hess
,
name
=
'hessian'
)
hess
=
list_to_1d_numpy
(
hess
,
name
=
'hessian'
)
assert
grad
.
flags
.
c_contiguous
assert
hess
.
flags
.
c_contiguous
if
len
(
grad
)
!=
len
(
hess
):
if
len
(
grad
)
!=
len
(
hess
):
raise
ValueError
(
"Lengths of gradient({}) and hessian({}) don't match"
.
format
(
len
(
grad
),
len
(
hess
)))
raise
ValueError
(
"Lengths of gradient({}) and hessian({}) don't match"
.
format
(
len
(
grad
),
len
(
hess
)))
is_finished
=
ctypes
.
c_int
(
0
)
is_finished
=
ctypes
.
c_int
(
0
)
...
...
tests/python_package_test/test_engine.py
View file @
1e61f24f
...
@@ -12,6 +12,7 @@ from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
...
@@ -12,6 +12,7 @@ from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
load_iris
,
load_svmlight_file
)
load_iris
,
load_svmlight_file
)
from
sklearn.metrics
import
log_loss
,
mean_absolute_error
,
mean_squared_error
from
sklearn.metrics
import
log_loss
,
mean_absolute_error
,
mean_squared_error
from
sklearn.model_selection
import
train_test_split
,
TimeSeriesSplit
from
sklearn.model_selection
import
train_test_split
,
TimeSeriesSplit
from
scipy.sparse
import
csr_matrix
try
:
try
:
import
pandas
as
pd
import
pandas
as
pd
...
@@ -548,3 +549,53 @@ class TestEngine(unittest.TestCase):
...
@@ -548,3 +549,53 @@ class TestEngine(unittest.TestCase):
evals_result
=
evals_result
)
evals_result
=
evals_result
)
self
.
assertLess
(
np
.
linalg
.
norm
(
gbm
.
predict
(
X_test
,
raw_score
=
True
)
-
np
.
sum
(
gbm
.
predict
(
X_test
,
pred_contrib
=
True
),
axis
=
1
)),
1e-4
)
self
.
assertLess
(
np
.
linalg
.
norm
(
gbm
.
predict
(
X_test
,
raw_score
=
True
)
-
np
.
sum
(
gbm
.
predict
(
X_test
,
pred_contrib
=
True
),
axis
=
1
)),
1e-4
)
def test_sliced_data(self):
    """Training on sliced views (labels, 2d matrix, CSR) must match training on the originals."""

    def _fit_predict(x, y):
        # Train a tiny binary model on (x, y) and score the training features.
        booster = lgb.train(
            params={'application': 'binary', 'verbose': -1, 'min_data': 5},
            train_set=lgb.Dataset(x, label=y),
            num_boost_round=10,
        )
        return booster.predict(x)

    num_samples = 100
    features = np.random.rand(num_samples, 5)
    positive_samples = int(num_samples * 0.25)
    labels = np.append(
        np.ones(positive_samples, dtype=np.float32),
        np.zeros(num_samples - positive_samples, dtype=np.float32),
    )
    # Sliced labels: a column view of a stacked matrix must behave like the 1d original.
    origin_pred = _fit_predict(features, labels)
    stacked_labels = np.column_stack((labels, np.ones(num_samples, dtype=np.float32)))
    sliced_labels = stacked_labels[:, 0]
    np.testing.assert_almost_equal(origin_pred, _fit_predict(features, sliced_labels))
    # Pad the feature matrix with two all-ones columns on each side ...
    stacked_features = features
    for _ in range(2):
        stacked_features = np.column_stack((np.ones(num_samples, dtype=np.float32), stacked_features))
    for _ in range(2):
        stacked_features = np.column_stack((stacked_features, np.ones(num_samples, dtype=np.float32)))
    # ... and two all-ones rows on the top and on the bottom.
    ones_row = np.ones(9, dtype=np.float32).reshape((1, 9))
    for _ in range(2):
        stacked_features = np.concatenate((ones_row, stacked_features), axis=0)
    for _ in range(2):
        stacked_features = np.concatenate((stacked_features, ones_row), axis=0)
    # Sliced 2d matrix: the interior window equals the original features.
    sliced_features = stacked_features[2:102, 2:7]
    assert np.all(sliced_features == features)
    np.testing.assert_almost_equal(origin_pred, _fit_predict(sliced_features, sliced_labels))
    # Sliced CSR: the same window taken from a sparse copy.
    stacked_csr = csr_matrix(stacked_features)
    sliced_csr = stacked_csr[2:102, 2:7]
    assert np.all(sliced_csr == features)
    np.testing.assert_almost_equal(origin_pred, _fit_predict(sliced_csr, sliced_labels))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment