Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
d84582b7
Unverified
Commit
d84582b7
authored
Dec 06, 2023
by
Oliver Borchert
Committed by
GitHub
Dec 06, 2023
Browse files
Fix null handling for Arrow data (#6227)
parent
f5b6bd60
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
16 additions
and
3 deletions
+16
-3
include/LightGBM/arrow.tpp
include/LightGBM/arrow.tpp
+1
-1
tests/cpp_tests/test_arrow.cpp
tests/cpp_tests/test_arrow.cpp
+4
-2
tests/python_package_test/test_arrow.py
tests/python_package_test/test_arrow.py
+11
-0
No files found.
include/LightGBM/arrow.tpp
View file @
d84582b7
...
@@ -144,7 +144,7 @@ struct ArrayIndexAccessor {
...
@@ -144,7 +144,7 @@ struct ArrayIndexAccessor {
// - The structure of validity bitmasks is taken from here:
// - The structure of validity bitmasks is taken from here:
// https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
// https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
// - If the bitmask is NULL, all indices are valid
// - If the bitmask is NULL, all indices are valid
if
(
validity
==
nullptr
||
!
(
validity
[
buffer_idx
/
8
]
&
(
1
<<
(
buffer_idx
%
8
))))
{
if
(
validity
==
nullptr
||
(
validity
[
buffer_idx
/
8
]
&
(
1
<<
(
buffer_idx
%
8
))))
{
// In case the index is valid, we take it from the data buffer
// In case the index is valid, we take it from the data buffer
auto
data
=
static_cast
<
const
T
*>
(
array
->
buffers
[
1
]);
auto
data
=
static_cast
<
const
T
*>
(
array
->
buffers
[
1
]);
return
static_cast
<
double
>
(
data
[
buffer_idx
]);
return
static_cast
<
double
>
(
data
[
buffer_idx
]);
...
...
tests/cpp_tests/test_arrow.cpp
View file @
d84582b7
...
@@ -41,10 +41,12 @@ class ArrowChunkedArrayTest : public testing::Test {
...
@@ -41,10 +41,12 @@ class ArrowChunkedArrayTest : public testing::Test {
// 1) Create validity bitmap
// 1) Create validity bitmap
char
*
validity
=
nullptr
;
char
*
validity
=
nullptr
;
if
(
!
null_indices
.
empty
())
{
if
(
!
null_indices
.
empty
())
{
validity
=
static_cast
<
char
*>
(
calloc
(
values
.
size
()
+
sizeof
(
char
)
-
1
,
sizeof
(
char
)));
auto
num_bytes
=
(
values
.
size
()
+
7
)
/
8
;
validity
=
static_cast
<
char
*>
(
calloc
(
num_bytes
,
sizeof
(
char
)));
memset
(
validity
,
0xff
,
num_bytes
*
sizeof
(
char
));
for
(
size_t
i
=
0
;
i
<
values
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
values
.
size
();
++
i
)
{
if
(
std
::
find
(
null_indices
.
begin
(),
null_indices
.
end
(),
i
)
!=
null_indices
.
end
())
{
if
(
std
::
find
(
null_indices
.
begin
(),
null_indices
.
end
(),
i
)
!=
null_indices
.
end
())
{
validity
[
i
/
8
]
|
=
(
1
<<
(
i
%
8
));
validity
[
i
/
8
]
&
=
~
(
1
<<
(
i
%
8
));
}
}
}
}
}
}
...
...
tests/python_package_test/test_arrow.py
View file @
d84582b7
...
@@ -46,6 +46,16 @@ def generate_simple_arrow_table() -> pa.Table:
...
@@ -46,6 +46,16 @@ def generate_simple_arrow_table() -> pa.Table:
return
pa
.
Table
.
from_arrays
(
columns
,
names
=
[
f
"col_
{
i
}
"
for
i
in
range
(
len
(
columns
))])
return
pa
.
Table
.
from_arrays
(
columns
,
names
=
[
f
"col_
{
i
}
"
for
i
in
range
(
len
(
columns
))])
def generate_nullable_arrow_table() -> pa.Table:
    """Build a small Arrow table whose float32 columns contain null entries.

    The table has four columns named ``col_0`` .. ``col_3``. Each column is a
    chunked array made of a single chunk holding five values:

    - ``col_0``: null in the second position
    - ``col_1``: null in the first position
    - ``col_2``: null in the last position
    - ``col_3``: every value is null

    Returns:
        A ``pa.Table`` assembled from the four chunked arrays.
    """
    # One inner list per column; ``None`` marks a null entry.
    column_values = [
        [1, None, 3, 4, 5],
        [None, 2, 3, 4, 5],
        [1, 2, 3, 4, None],
        [None, None, None, None, None],
    ]
    # Wrap each value list as a single-chunk float32 chunked array.
    columns = [pa.chunked_array([values], type=pa.float32()) for values in column_values]
    names = [f"col_{idx}" for idx in range(len(columns))]
    return pa.Table.from_arrays(columns, names=names)
def
generate_dummy_arrow_table
()
->
pa
.
Table
:
def
generate_dummy_arrow_table
()
->
pa
.
Table
:
col1
=
pa
.
chunked_array
([[
1
,
2
,
3
],
[
4
,
5
]],
type
=
pa
.
uint8
())
col1
=
pa
.
chunked_array
([[
1
,
2
,
3
],
[
4
,
5
]],
type
=
pa
.
uint8
())
col2
=
pa
.
chunked_array
([[
0.5
,
0.6
],
[
0.1
,
0.8
,
1.5
]],
type
=
pa
.
float32
())
col2
=
pa
.
chunked_array
([[
0.5
,
0.6
],
[
0.1
,
0.8
,
1.5
]],
type
=
pa
.
float32
())
...
@@ -95,6 +105,7 @@ def dummy_dataset_params() -> Dict[str, Any]:
...
@@ -95,6 +105,7 @@ def dummy_dataset_params() -> Dict[str, Any]:
[
# Use lambda functions here to minimize memory consumption
[
# Use lambda functions here to minimize memory consumption
(
lambda
:
generate_simple_arrow_table
(),
dummy_dataset_params
()),
(
lambda
:
generate_simple_arrow_table
(),
dummy_dataset_params
()),
(
lambda
:
generate_dummy_arrow_table
(),
dummy_dataset_params
()),
(
lambda
:
generate_dummy_arrow_table
(),
dummy_dataset_params
()),
(
lambda
:
generate_nullable_arrow_table
(),
dummy_dataset_params
()),
(
lambda
:
generate_random_arrow_table
(
3
,
1000
,
42
),
{}),
(
lambda
:
generate_random_arrow_table
(
3
,
1000
,
42
),
{}),
(
lambda
:
generate_random_arrow_table
(
100
,
10000
,
43
),
{}),
(
lambda
:
generate_random_arrow_table
(
100
,
10000
,
43
),
{}),
],
],
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment