Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
eff287e9
Unverified
Commit
eff287e9
authored
Sep 21, 2020
by
Ilya Matiach
Committed by
GitHub
Sep 21, 2020
Browse files
fix sparse multiclass local feature contributions and add test (#3382)
parent
1782fcb1
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
62 additions
and
5 deletions
+62
-5
src/c_api.cpp
src/c_api.cpp
+17
-5
tests/python_package_test/test_engine.py
tests/python_package_test/test_engine.py
+45
-0
No files found.
src/c_api.cpp
View file @
eff287e9
...
@@ -515,6 +515,7 @@ class Booster {
...
@@ -515,6 +515,7 @@ class Booster {
out_indices
,
out_data
,
data_type
,
&
is_data_float32
,
num_matrices
);
out_indices
,
out_data
,
data_type
,
&
is_data_float32
,
num_matrices
);
std
::
vector
<
int
>
row_sizes
(
num_matrices
*
nrow
);
std
::
vector
<
int
>
row_sizes
(
num_matrices
*
nrow
);
std
::
vector
<
int64_t
>
row_matrix_offsets
(
num_matrices
*
nrow
);
std
::
vector
<
int64_t
>
row_matrix_offsets
(
num_matrices
*
nrow
);
std
::
vector
<
int64_t
>
matrix_offsets
(
num_matrices
);
int64_t
row_vector_cnt
=
0
;
int64_t
row_vector_cnt
=
0
;
for
(
int
m
=
0
;
m
<
num_matrices
;
++
m
)
{
for
(
int
m
=
0
;
m
<
num_matrices
;
++
m
)
{
for
(
int64_t
i
=
0
;
i
<
static_cast
<
int64_t
>
(
agg
.
size
());
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
static_cast
<
int64_t
>
(
agg
.
size
());
++
i
)
{
...
@@ -529,6 +530,12 @@ class Booster {
...
@@ -529,6 +530,12 @@ class Booster {
}
}
row_vector_cnt
++
;
row_vector_cnt
++
;
}
}
if
(
m
==
0
)
{
matrix_offsets
[
m
]
=
0
;
}
if
(
m
+
1
<
num_matrices
)
{
matrix_offsets
[
m
+
1
]
=
static_cast
<
int64_t
>
(
matrix_offsets
[
m
]
+
row_matrix_offsets
[
row_vector_cnt
-
1
]
+
row_sizes
[
row_vector_cnt
-
1
]);
}
}
}
// copy vector results to output for each row
// copy vector results to output for each row
int64_t
indptr_index
=
0
;
int64_t
indptr_index
=
0
;
...
@@ -546,7 +553,7 @@ class Booster {
...
@@ -546,7 +553,7 @@ class Booster {
OMP_LOOP_EX_BEGIN
();
OMP_LOOP_EX_BEGIN
();
auto
row_vector
=
agg
[
i
];
auto
row_vector
=
agg
[
i
];
int64_t
row_start_index
=
matrix_start_index
+
i
;
int64_t
row_start_index
=
matrix_start_index
+
i
;
int64_t
element_index
=
row_matrix_offsets
[
row_start_index
];
int64_t
element_index
=
row_matrix_offsets
[
row_start_index
]
+
matrix_offsets
[
m
]
;
int64_t
indptr_loop_index
=
indptr_index
+
i
;
int64_t
indptr_loop_index
=
indptr_index
+
i
;
for
(
auto
it
=
row_vector
[
m
].
begin
();
it
!=
row_vector
[
m
].
end
();
++
it
)
{
for
(
auto
it
=
row_vector
[
m
].
begin
();
it
!=
row_vector
[
m
].
end
();
++
it
)
{
(
*
out_indices
)[
element_index
]
=
it
->
first
;
(
*
out_indices
)[
element_index
]
=
it
->
first
;
...
@@ -646,13 +653,16 @@ class Booster {
...
@@ -646,13 +653,16 @@ class Booster {
}
else
{
}
else
{
(
reinterpret_cast
<
int64_t
*>
(
*
out_col_ptr
))[
col_ptr_index
]
=
last_column_start_index
+
last_column_size
;
(
reinterpret_cast
<
int64_t
*>
(
*
out_col_ptr
))[
col_ptr_index
]
=
last_column_start_index
+
last_column_size
;
}
}
if
(
m
!=
0
)
{
if
(
m
+
1
<
num_matrices
)
{
matrix_start_indices
[
m
]
=
matrix_start_indices
[
m
-
1
]
+
matrix_start_indices
[
m
+
1
]
=
matrix_start_indices
[
m
]
+
last_column_start_index
+
last_column_size
;
last_column_start_index
+
last_column_size
;
}
}
col_ptr_index
++
;
}
}
// Note: we parallelize across matrices instead of rows because of the column_counts[m][col_idx] increment inside the loop
OMP_INIT_EX
();
#pragma omp parallel for schedule(static)
for
(
int
m
=
0
;
m
<
num_matrices
;
++
m
)
{
for
(
int
m
=
0
;
m
<
num_matrices
;
++
m
)
{
OMP_LOOP_EX_BEGIN
();
for
(
int64_t
i
=
0
;
i
<
static_cast
<
int64_t
>
(
agg
.
size
());
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
static_cast
<
int64_t
>
(
agg
.
size
());
++
i
)
{
auto
row_vector
=
agg
[
i
];
auto
row_vector
=
agg
[
i
];
for
(
auto
it
=
row_vector
[
m
].
begin
();
it
!=
row_vector
[
m
].
end
();
++
it
)
{
for
(
auto
it
=
row_vector
[
m
].
begin
();
it
!=
row_vector
[
m
].
end
();
++
it
)
{
...
@@ -671,7 +681,9 @@ class Booster {
...
@@ -671,7 +681,9 @@ class Booster {
}
}
}
}
}
}
OMP_LOOP_EX_END
();
}
}
OMP_THROW_EX
();
out_len
[
0
]
=
elements_size
;
out_len
[
0
]
=
elements_size
;
out_len
[
1
]
=
col_ptr_size
;
out_len
[
1
]
=
col_ptr_size
;
}
}
...
...
tests/python_package_test/test_engine.py
View file @
eff287e9
...
@@ -1034,6 +1034,51 @@ class TestEngine(unittest.TestCase):
...
@@ -1034,6 +1034,51 @@ class TestEngine(unittest.TestCase):
# validate the values are the same
# validate the values are the same
np
.
testing
.
assert_allclose
(
contribs_csc
.
toarray
(),
contribs_dense
)
np
.
testing
.
assert_allclose
(
contribs_csc
.
toarray
(),
contribs_dense
)
def
test_contribs_sparse_multiclass
(
self
):
n_features
=
20
n_samples
=
100
n_labels
=
4
# generate CSR sparse dataset
X
,
y
=
make_multilabel_classification
(
n_samples
=
n_samples
,
sparse
=
True
,
n_features
=
n_features
,
n_classes
=
1
,
n_labels
=
n_labels
)
y
=
y
.
flatten
()
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
42
)
params
=
{
'objective'
:
'multiclass'
,
'num_class'
:
n_labels
,
'verbose'
:
-
1
,
}
lgb_train
=
lgb
.
Dataset
(
X_train
,
y_train
)
gbm
=
lgb
.
train
(
params
,
lgb_train
,
num_boost_round
=
20
)
contribs_csr
=
gbm
.
predict
(
X_test
,
pred_contrib
=
True
)
self
.
assertTrue
(
isinstance
(
contribs_csr
,
list
))
for
perclass_contribs_csr
in
contribs_csr
:
self
.
assertTrue
(
isspmatrix_csr
(
perclass_contribs_csr
))
# convert data to dense and get back same contribs
contribs_dense
=
gbm
.
predict
(
X_test
.
toarray
(),
pred_contrib
=
True
)
# validate the values are the same
contribs_csr_array
=
np
.
swapaxes
(
np
.
array
([
sparse_array
.
todense
()
for
sparse_array
in
contribs_csr
]),
0
,
1
)
contribs_csr_arr_re
=
contribs_csr_array
.
reshape
((
contribs_csr_array
.
shape
[
0
],
contribs_csr_array
.
shape
[
1
]
*
contribs_csr_array
.
shape
[
2
]))
np
.
testing
.
assert_allclose
(
contribs_csr_arr_re
,
contribs_dense
)
contribs_dense_re
=
contribs_dense
.
reshape
(
contribs_csr_array
.
shape
)
self
.
assertLess
(
np
.
linalg
.
norm
(
gbm
.
predict
(
X_test
,
raw_score
=
True
)
-
np
.
sum
(
contribs_dense_re
,
axis
=
2
)),
1e-4
)
# validate using CSC matrix
X_test_csc
=
X_test
.
tocsc
()
contribs_csc
=
gbm
.
predict
(
X_test_csc
,
pred_contrib
=
True
)
self
.
assertTrue
(
isinstance
(
contribs_csc
,
list
))
for
perclass_contribs_csc
in
contribs_csc
:
self
.
assertTrue
(
isspmatrix_csc
(
perclass_contribs_csc
))
# validate the values are the same
contribs_csc_array
=
np
.
swapaxes
(
np
.
array
([
sparse_array
.
todense
()
for
sparse_array
in
contribs_csc
]),
0
,
1
)
contribs_csc_array
=
contribs_csc_array
.
reshape
((
contribs_csc_array
.
shape
[
0
],
contribs_csc_array
.
shape
[
1
]
*
contribs_csc_array
.
shape
[
2
]))
np
.
testing
.
assert_allclose
(
contribs_csc_array
,
contribs_dense
)
@
unittest
.
skipIf
(
psutil
.
virtual_memory
().
available
/
1024
/
1024
/
1024
<
3
,
'not enough RAM'
)
@
unittest
.
skipIf
(
psutil
.
virtual_memory
().
available
/
1024
/
1024
/
1024
<
3
,
'not enough RAM'
)
def
test_int32_max_sparse_contribs
(
self
):
def
test_int32_max_sparse_contribs
(
self
):
params
=
{
params
=
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment