Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
eff287e9
"vscode:/vscode.git/clone" did not exist on "6823af945390ba89b66d7e579501dc87e3cc6162"
Unverified
Commit
eff287e9
authored
Sep 21, 2020
by
Ilya Matiach
Committed by
GitHub
Sep 21, 2020
Browse files
fix sparse multiclass local feature contributions and add test (#3382)
parent
1782fcb1
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
62 additions
and
5 deletions
+62
-5
src/c_api.cpp
src/c_api.cpp
+17
-5
tests/python_package_test/test_engine.py
tests/python_package_test/test_engine.py
+45
-0
No files found.
src/c_api.cpp
View file @
eff287e9
...
...
@@ -515,6 +515,7 @@ class Booster {
out_indices
,
out_data
,
data_type
,
&
is_data_float32
,
num_matrices
);
std
::
vector
<
int
>
row_sizes
(
num_matrices
*
nrow
);
std
::
vector
<
int64_t
>
row_matrix_offsets
(
num_matrices
*
nrow
);
std
::
vector
<
int64_t
>
matrix_offsets
(
num_matrices
);
int64_t
row_vector_cnt
=
0
;
for
(
int
m
=
0
;
m
<
num_matrices
;
++
m
)
{
for
(
int64_t
i
=
0
;
i
<
static_cast
<
int64_t
>
(
agg
.
size
());
++
i
)
{
...
...
@@ -529,6 +530,12 @@ class Booster {
}
row_vector_cnt
++
;
}
if
(
m
==
0
)
{
matrix_offsets
[
m
]
=
0
;
}
if
(
m
+
1
<
num_matrices
)
{
matrix_offsets
[
m
+
1
]
=
static_cast
<
int64_t
>
(
matrix_offsets
[
m
]
+
row_matrix_offsets
[
row_vector_cnt
-
1
]
+
row_sizes
[
row_vector_cnt
-
1
]);
}
}
// copy vector results to output for each row
int64_t
indptr_index
=
0
;
...
...
@@ -546,7 +553,7 @@ class Booster {
OMP_LOOP_EX_BEGIN
();
auto
row_vector
=
agg
[
i
];
int64_t
row_start_index
=
matrix_start_index
+
i
;
int64_t
element_index
=
row_matrix_offsets
[
row_start_index
];
int64_t
element_index
=
row_matrix_offsets
[
row_start_index
]
+
matrix_offsets
[
m
]
;
int64_t
indptr_loop_index
=
indptr_index
+
i
;
for
(
auto
it
=
row_vector
[
m
].
begin
();
it
!=
row_vector
[
m
].
end
();
++
it
)
{
(
*
out_indices
)[
element_index
]
=
it
->
first
;
...
...
@@ -646,13 +653,16 @@ class Booster {
}
else
{
(
reinterpret_cast
<
int64_t
*>
(
*
out_col_ptr
))[
col_ptr_index
]
=
last_column_start_index
+
last_column_size
;
}
if
(
m
!=
0
)
{
matrix_start_indices
[
m
]
=
matrix_start_indices
[
m
-
1
]
+
last_column_start_index
+
last_column_size
;
if
(
m
+
1
<
num_matrices
)
{
matrix_start_indices
[
m
+
1
]
=
matrix_start_indices
[
m
]
+
last_column_start_index
+
last_column_size
;
}
col_ptr_index
++
;
}
// Note: we parallelize across matrices instead of rows because of the column_counts[m][col_idx] increment inside the loop
OMP_INIT_EX
();
#pragma omp parallel for schedule(static)
for
(
int
m
=
0
;
m
<
num_matrices
;
++
m
)
{
OMP_LOOP_EX_BEGIN
();
for
(
int64_t
i
=
0
;
i
<
static_cast
<
int64_t
>
(
agg
.
size
());
++
i
)
{
auto
row_vector
=
agg
[
i
];
for
(
auto
it
=
row_vector
[
m
].
begin
();
it
!=
row_vector
[
m
].
end
();
++
it
)
{
...
...
@@ -671,7 +681,9 @@ class Booster {
}
}
}
OMP_LOOP_EX_END
();
}
OMP_THROW_EX
();
out_len
[
0
]
=
elements_size
;
out_len
[
1
]
=
col_ptr_size
;
}
...
...
tests/python_package_test/test_engine.py
View file @
eff287e9
...
...
@@ -1034,6 +1034,51 @@ class TestEngine(unittest.TestCase):
# validate the values are the same
np
.
testing
.
assert_allclose
(
contribs_csc
.
toarray
(),
contribs_dense
)
def test_contribs_sparse_multiclass(self):
    """Compare sparse multiclass feature contributions with the dense path.

    Trains a 'multiclass' booster on a CSR dataset and then asserts that:

    * predicting contributions on CSR input yields a list of CSR matrices;
    * those matrices, stacked and flattened, match the contributions
      predicted from the densified input;
    * the contributions, summed over the last axis, reproduce the raw
      scores to within 1e-4 (Frobenius norm);
    * the same holds for CSC input (list of CSC matrices, same values).
    """
    n_features = 20
    n_samples = 100
    n_labels = 4

    def stack_per_class(per_class_contribs):
        # Densify every per-class sparse matrix, stack them along a new
        # leading axis, then swap the first two axes so that the axis that
        # indexed the list entries becomes the second one.
        # toarray() is used instead of todense() so plain ndarrays (not
        # numpy.matrix objects) are stacked; the values are identical.
        dense_blocks = [sparse_array.toarray() for sparse_array in per_class_contribs]
        return np.swapaxes(np.array(dense_blocks), 0, 1)

    def merge_last_two_axes(stacked):
        # Collapse the last two axes into one so the 3-D stack can be
        # compared element-wise with the 2-D dense prediction.
        return stacked.reshape((stacked.shape[0], stacked.shape[1] * stacked.shape[2]))

    # generate CSR sparse dataset
    X, y = make_multilabel_classification(n_samples=n_samples,
                                          sparse=True,
                                          n_features=n_features,
                                          n_classes=1,
                                          n_labels=n_labels)
    y = y.flatten()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    params = {
        'objective': 'multiclass',
        'num_class': n_labels,
        'verbose': -1,
    }
    lgb_train = lgb.Dataset(X_train, y_train)
    gbm = lgb.train(params, lgb_train, num_boost_round=20)

    # CSR input: expect a list of CSR matrices
    contribs_csr = gbm.predict(X_test, pred_contrib=True)
    self.assertIsInstance(contribs_csr, list)
    for perclass_contribs_csr in contribs_csr:
        self.assertTrue(isspmatrix_csr(perclass_contribs_csr))

    # convert data to dense and get back same contribs
    contribs_dense = gbm.predict(X_test.toarray(), pred_contrib=True)

    # validate the values are the same
    contribs_csr_array = stack_per_class(contribs_csr)
    contribs_csr_arr_re = merge_last_two_axes(contribs_csr_array)
    np.testing.assert_allclose(contribs_csr_arr_re, contribs_dense)

    # contributions summed over the last axis must reproduce the raw scores
    contribs_dense_re = contribs_dense.reshape(contribs_csr_array.shape)
    self.assertLess(np.linalg.norm(gbm.predict(X_test, raw_score=True)
                                   - np.sum(contribs_dense_re, axis=2)), 1e-4)

    # validate using CSC matrix
    X_test_csc = X_test.tocsc()
    contribs_csc = gbm.predict(X_test_csc, pred_contrib=True)
    self.assertIsInstance(contribs_csc, list)
    for perclass_contribs_csc in contribs_csc:
        self.assertTrue(isspmatrix_csc(perclass_contribs_csc))

    # validate the values are the same
    contribs_csc_array = merge_last_two_axes(stack_per_class(contribs_csc))
    np.testing.assert_allclose(contribs_csc_array, contribs_dense)
@
unittest
.
skipIf
(
psutil
.
virtual_memory
().
available
/
1024
/
1024
/
1024
<
3
,
'not enough RAM'
)
def
test_int32_max_sparse_contribs
(
self
):
params
=
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment