tianlh / LightGBM-DCU · Commits

Commit 5df9584f, authored Jul 07, 2018 by Fedor Korotkiy, committed Jul 07, 2018 by Guolin Ke

Support creating Dataset from list of matrices (#1474)

parent 2e93cdab

Changes: 4 changed files, with 146 additions and 19 deletions.
include/LightGBM/c_api.h                  +21   -0
python-package/lightgbm/basic.py          +50   -0
src/c_api.cpp                             +62  -19
tests/python_package_test/test_basic.py   +13   -0
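Taken together, the change lets the Python Dataset constructor accept a list of 2-D numpy arrays that share a column count, pushing the chunks into one native Dataset without concatenating them in Python. A minimal usage sketch (not part of this commit; shapes and labels are illustrative):

    import numpy as np
    import lightgbm as lgb

    # Three row-chunks of one logical 600x20 matrix; all chunks
    # must have the same number of columns.
    chunks = [np.random.rand(200, 20) for _ in range(3)]
    label = np.random.randint(0, 2, size=600)

    # The new list-of-ndarray branch in basic.py routes this to
    # LGBM_DatasetCreateFromMats instead of requiring np.vstack.
    train_data = lgb.Dataset(chunks, label=label)
    train_data.construct()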
include/LightGBM/c_api.h

@@ -204,6 +204,27 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
                                                 const DatasetHandle reference,
                                                 DatasetHandle* out);
 
+/*!
+* \brief create dataset from an array of dense matrices
+* \param nmat number of matrices
+* \param data pointer to the data space
+* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
+* \param nrow number of rows in each matrix
+* \param ncol number of columns
+* \param is_row_major 1 for row-major, 0 for column-major
+* \param parameters additional parameters
+* \param reference used to align bin mapper with other dataset, nullptr means it isn't used
+* \param out created dataset
+* \return 0 when succeed, -1 when failure happens
+*/
+LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat,
+                                                 const void** data,
+                                                 int data_type,
+                                                 int32_t* nrow,
+                                                 int32_t ncol,
+                                                 int is_row_major,
+                                                 const char* parameters,
+                                                 const DatasetHandle reference,
+                                                 DatasetHandle* out);
+
 /*!
 * \brief Create subset of a data
 * \param handle handle of full dataset
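For C API users, the new entry point takes an array of nmat matrix pointers plus a per-matrix row-count array, while data_type, ncol, and is_row_major are shared by every chunk. A hedged ctypes sketch of a direct call, mirroring the plumbing in __init_from_list_np2d below (the helpers _LIB and c_str are internal to lightgbm.basic, the dtype enum value 0 is assumed to be C_API_DTYPE_FLOAT32, and error handling via LGBM_GetLastError is omitted):

    import ctypes
    import numpy as np
    from lightgbm.basic import _LIB, c_str  # internal helpers, assumed importable

    # Two contiguous float32 chunks with the same column count.
    chunks = [np.random.rand(3, 4).astype(np.float32),
              np.random.rand(5, 4).astype(np.float32)]

    # Array of per-chunk data pointers (float* here, cast to double**
    # at the call site, as the Python wrapper does).
    ptr_data = (ctypes.POINTER(ctypes.c_float) * len(chunks))()
    for i, chunk in enumerate(chunks):
        ptr_data[i] = chunk.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    nrow = np.array([c.shape[0] for c in chunks], dtype=np.int32)

    handle = ctypes.c_void_p()
    _LIB.LGBM_DatasetCreateFromMats(  # returns 0 on success
        ctypes.c_int32(len(chunks)),
        ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))),
        ctypes.c_int(0),  # assumed C_API_DTYPE_FLOAT32
        nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        ctypes.c_int32(chunks[0].shape[1]),
        ctypes.c_int(1),  # row-major
        c_str(''),
        None,             # no reference dataset
        ctypes.byref(handle))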
python-package/lightgbm/basic.py

@@ -712,6 +712,8 @@ class Dataset(object):
             self.__init_from_csc(data, params_str, ref_dataset)
         elif isinstance(data, np.ndarray):
             self.__init_from_np2d(data, params_str, ref_dataset)
+        elif isinstance(data, list) and len(data) > 0 and all(isinstance(x, np.ndarray) for x in data):
+            self.__init_from_list_np2d(data, params_str, ref_dataset)
         else:
             try:
                 csr = scipy.sparse.csr_matrix(data)

@@ -775,6 +777,54 @@ class Dataset(object):
             ref_dataset,
             ctypes.byref(self.handle)))
 
+    def __init_from_list_np2d(self, mats, params_str, ref_dataset):
+        """
+        Initialize data from list of 2-D numpy matrices.
+        """
+        ncol = mats[0].shape[1]
+        nrow = np.zeros((len(mats),), np.int32)
+        if mats[0].dtype == np.float64:
+            ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))()
+        else:
+            ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))()
+
+        holders = []
+        type_ptr_data = None
+
+        for i, mat in enumerate(mats):
+            if len(mat.shape) != 2:
+                raise ValueError('Input numpy.ndarray must be 2 dimensional')
+
+            if mat.shape[1] != ncol:
+                raise ValueError('Input arrays must have same number of columns')
+
+            nrow[i] = mat.shape[0]
+
+            if mat.dtype == np.float32 or mat.dtype == np.float64:
+                mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
+            else:
+                # change non-float data to float data, need to copy
+                mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)
+
+            chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i])
+            if type_ptr_data is not None and chunk_type_ptr_data != type_ptr_data:
+                raise ValueError('Input chunks must have same type')
+            ptr_data[i] = chunk_ptr_data
+            type_ptr_data = chunk_type_ptr_data
+            holders.append(holder)
+
+        self.handle = ctypes.c_void_p()
+        _safe_call(_LIB.LGBM_DatasetCreateFromMats(
+            ctypes.c_int(len(mats)),
+            ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))),
+            ctypes.c_int(type_ptr_data),
+            nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
+            ctypes.c_int(ncol),
+            ctypes.c_int(C_API_IS_ROW_MAJOR),
+            c_str(params_str),
+            ref_dataset,
+            ctypes.byref(self.handle)))
+
     def __init_from_csr(self, csr, params_str, ref_dataset):
         """
         Initialize data from a CSR matrix.
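One consequence of the chunk-type check above: after the optional copy, every chunk must map to the same C float type, so mixing float32 and float64 chunks raises, while non-float chunks are silently copied to float32 first. An illustrative sketch of that behavior, inferred from the code above rather than taken from the commit:

    import numpy as np
    import lightgbm as lgb

    mixed = [np.zeros((10, 3), dtype=np.float32),
             np.zeros((10, 3), dtype=np.float64)]
    try:
        lgb.Dataset(mixed, label=np.zeros(20)).construct()
    except ValueError as err:
        print(err)  # 'Input chunks must have same type'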
src/c_api.cpp

@@ -475,6 +475,27 @@ int LGBM_DatasetCreateFromMat(const void* data,
                               const char* parameters,
                               const DatasetHandle reference,
                               DatasetHandle* out) {
+  return LGBM_DatasetCreateFromMats(1,
+                                    &data,
+                                    data_type,
+                                    &nrow,
+                                    ncol,
+                                    is_row_major,
+                                    parameters,
+                                    reference,
+                                    out);
+}
+
+int LGBM_DatasetCreateFromMats(int32_t nmat,
+                               const void** data,
+                               int data_type,
+                               int32_t* nrow,
+                               int32_t ncol,
+                               int is_row_major,
+                               const char* parameters,
+                               const DatasetHandle reference,
+                               DatasetHandle* out) {
   API_BEGIN();
   auto param = Config::Str2Map(parameters);
   Config config;

@@ -483,22 +504,39 @@ int LGBM_DatasetCreateFromMat(const void* data,
     omp_set_num_threads(config.num_threads);
   }
   std::unique_ptr<Dataset> ret;
-  auto get_row_fun = RowFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
+  int32_t total_nrow = 0;
+  for (int j = 0; j < nmat; ++j) {
+    total_nrow += nrow[j];
+  }
+  std::vector<std::function<std::vector<double>(int row_idx)>> get_row_fun;
+  for (int j = 0; j < nmat; ++j) {
+    get_row_fun.push_back(RowFunctionFromDenseMatric(data[j], nrow[j], ncol, data_type, is_row_major));
+  }
   if (reference == nullptr) {
     // sample data first
     Random rand(config.data_random_seed);
-    int sample_cnt = static_cast<int>(nrow < config.bin_construct_sample_cnt ? nrow : config.bin_construct_sample_cnt);
-    auto sample_indices = rand.Sample(nrow, sample_cnt);
+    int sample_cnt = static_cast<int>(total_nrow < config.bin_construct_sample_cnt ? total_nrow : config.bin_construct_sample_cnt);
+    auto sample_indices = rand.Sample(total_nrow, sample_cnt);
     sample_cnt = static_cast<int>(sample_indices.size());
     std::vector<std::vector<double>> sample_values(ncol);
     std::vector<std::vector<int>> sample_idx(ncol);
+    int offset = 0;
+    int j = 0;
     for (size_t i = 0; i < sample_indices.size(); ++i) {
       auto idx = sample_indices[i];
-      auto row = get_row_fun(static_cast<int>(idx));
-      for (size_t j = 0; j < row.size(); ++j) {
-        if (std::fabs(row[j]) > kZeroThreshold || std::isnan(row[j])) {
-          sample_values[j].emplace_back(row[j]);
-          sample_idx[j].emplace_back(static_cast<int>(i));
+      while ((idx - offset) >= nrow[j]) {
+        offset += nrow[j];
+        ++j;
+      }
+      auto row = get_row_fun[j](static_cast<int>(idx - offset));
+      for (size_t k = 0; k < row.size(); ++k) {
+        if (std::fabs(row[k]) > kZeroThreshold || std::isnan(row[k])) {
+          sample_values[k].emplace_back(row[k]);
+          sample_idx[k].emplace_back(static_cast<int>(i));
         }
       }
     }

@@ -507,22 +545,27 @@ int LGBM_DatasetCreateFromMat(const void* data,
       Common::Vector2Ptr<int>(sample_idx).data(),
       static_cast<int>(sample_values.size()),
       Common::VectorSize<double>(sample_values).data(),
-      sample_cnt, nrow));
+      sample_cnt, total_nrow));
   } else {
-    ret.reset(new Dataset(nrow));
+    ret.reset(new Dataset(total_nrow));
     ret->CreateValid(
       reinterpret_cast<const Dataset*>(reference));
   }
-  OMP_INIT_EX();
-  #pragma omp parallel for schedule(static)
-  for (int i = 0; i < nrow; ++i) {
-    OMP_LOOP_EX_BEGIN();
-    const int tid = omp_get_thread_num();
-    auto one_row = get_row_fun(i);
-    ret->PushOneRow(tid, i, one_row);
-    OMP_LOOP_EX_END();
-  }
-  OMP_THROW_EX();
+  int32_t start_row = 0;
+  for (int j = 0; j < nmat; ++j) {
+    OMP_INIT_EX();
+    #pragma omp parallel for schedule(static)
+    for (int i = 0; i < nrow[j]; ++i) {
+      OMP_LOOP_EX_BEGIN();
+      const int tid = omp_get_thread_num();
+      auto one_row = get_row_fun[j](i);
+      ret->PushOneRow(tid, start_row + i, one_row);
+      OMP_LOOP_EX_END();
+    }
+    OMP_THROW_EX();
+    start_row += nrow[j];
+  }
   ret->FinishLoad();
   *out = ret.release();
   API_END();
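The least obvious part of the new sampling loop is the translation from a global sampled row index, drawn over the concatenated matrices, to a (matrix, local row) pair. The loop appears to rely on rand.Sample returning indices in ascending order, since offset and j only ever move forward across iterations. A small stand-alone sketch of the same walk (names are illustrative, not from the commit):

    def locate(indices, nrow):
        """Map sorted global row indices to (matrix j, local row) pairs."""
        offset, j, out = 0, 0, []
        for idx in indices:
            # Advance past whole chunks until idx falls inside chunk j.
            while idx - offset >= nrow[j]:
                offset += nrow[j]
                j += 1
            out.append((j, idx - offset))
        return out

    # Two chunks of 3 and 5 rows: global row 3 is row 0 of the second chunk.
    assert locate([0, 3, 7], [3, 5]) == [(0, 0), (1, 0), (1, 4)]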
tests/python_package_test/test_basic.py

@@ -60,3 +60,16 @@ class TestBasic(unittest.TestCase):
         for preds in zip(pred_early_stopping, pred_from_matr):
             # scores likely to be different, but prediction should still be the same
             self.assertEqual(preds[0] > 0, preds[1] > 0)
+
+    def test_chunked_dataset(self):
+        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2)
+
+        chunk_size = X_train.shape[0] // 10 + 1
+        X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
+        X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
+
+        train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
+        valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
+
+        train_data.construct()
+        valid_data.construct()
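As a follow-up to the test above: once constructed, a chunked Dataset behaves like one built from a single matrix, so it can be passed straight to training. A brief sketch (not part of the commit; the parameters are illustrative):

    # Continues the test scenario: train_data / valid_data were built from chunks.
    booster = lgb.train({"objective": "binary", "verbose": -1},
                        train_data,
                        num_boost_round=10,
                        valid_sets=[valid_data])
    print(booster.best_score)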