Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
f5b6bd60
Unverified
Commit
f5b6bd60
authored
Dec 04, 2023
by
Oliver Borchert
Committed by
GitHub
Dec 04, 2023
Browse files
[python-package] Allow to pass Arrow table and array as init scores (#6167)
parent
5083df15
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
95 additions
and
19 deletions
+95
-19
include/LightGBM/c_api.h
include/LightGBM/c_api.h
+3
-2
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+4
-0
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+20
-8
python-package/lightgbm/compat.py
python-package/lightgbm/compat.py
+2
-0
src/io/dataset.cpp
src/io/dataset.cpp
+2
-0
src/io/metadata.cpp
src/io/metadata.cpp
+20
-8
tests/python_package_test/test_arrow.py
tests/python_package_test/test_arrow.py
+44
-1
No files found.
include/LightGBM/c_api.h
View file @
f5b6bd60
...
...
@@ -559,9 +559,10 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle,
* \brief Set vector to a content in info.
* \note
* - \a group converts input datatype into ``int32``;
* - \a label and \a weight convert input datatype into ``float32``.
* - \a label and \a weight convert input datatype into ``float32``;
* - \a init_score converts input datatype into ``float64``.
* \param handle Handle of dataset
* \param field_name Field name, can be \a label, \a weight, \a group
* \param field_name Field name, can be \a label, \a weight, \a init_score, \a group
* \param n_chunks The number of Arrow arrays passed to this function
* \param chunks Pointer to the list of Arrow arrays
* \param schema Pointer to the schema of all Arrow arrays
...
...
include/LightGBM/dataset.h
View file @
f5b6bd60
...
...
@@ -125,6 +125,7 @@ class Metadata {
* \param init_score Initial scores, this class will manage memory for init_score.
*/
void
SetInitScore
(
const
double
*
init_score
,
data_size_t
len
);
/*!
* \brief Set initial scores from an Arrow chunked array.
* \param array Arrow chunked array whose values are read as ``double`` initial scores
*/
void
SetInitScore
(
const
ArrowChunkedArray
&
array
);
/*!
...
...
@@ -347,6 +348,9 @@ class Metadata {
void
SetWeightsFromIterator
(
It
first
,
It
last
);
/*! \brief Insert initial scores at the given index */
void
InsertInitScores
(
const
double
*
init_scores
,
data_size_t
start_index
,
data_size_t
len
,
data_size_t
source_size
);
/*! \brief Set init scores from pointers to the first element and the end of an iterator. */
template
<
typename
It
>
void
SetInitScoresFromIterator
(
It
first
,
It
last
);
/*! \brief Insert queries at the given index */
void
InsertQueries
(
const
data_size_t
*
queries
,
data_size_t
start_index
,
data_size_t
len
);
/*! \brief Set queries from pointers to the first element and the end of an iterator. */
...
...
python-package/lightgbm/basic.py
View file @
f5b6bd60
...
...
@@ -19,8 +19,8 @@ import numpy as np
import
scipy.sparse
from
.compat
import
(
PANDAS_INSTALLED
,
PYARROW_INSTALLED
,
arrow_cffi
,
arrow_is_floating
,
arrow_is_integer
,
concat
,
dt_DataTable
,
pa_Array
,
pa_ChunkedArray
,
pa_compute
,
pa_Table
,
pd_CategoricalDtype
,
pd_DataFrame
,
pd_Series
)
dt_DataTable
,
pa_Array
,
pa_chunked_array
,
pa_ChunkedArray
,
pa_compute
,
pa_Table
,
pd_CategoricalDtype
,
pd_DataFrame
,
pd_Series
)
from
.libpath
import
find_lib_path
if
TYPE_CHECKING
:
...
...
@@ -84,6 +84,9 @@ _LGBM_InitScoreType = Union[
np
.
ndarray
,
pd_Series
,
pd_DataFrame
,
pa_Table
,
pa_Array
,
pa_ChunkedArray
,
]
_LGBM_TrainDataType
=
Union
[
str
,
...
...
@@ -1660,7 +1663,7 @@ class Dataset:
sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None)
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task),
pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task)
or None, optional (default=None)
Init score for Dataset.
feature_name : list of str, or 'auto', optional (default="auto")
Feature names.
...
...
@@ -2440,7 +2443,7 @@ class Dataset:
sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None)
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task),
pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task)
or None, optional (default=None)
Init score for Dataset.
params : dict or None, optional (default=None)
Other parameters for validation Dataset.
...
...
@@ -2547,7 +2550,7 @@ class Dataset:
def
set_field
(
self
,
field_name
:
str
,
data
:
Optional
[
Union
[
List
[
List
[
float
]],
List
[
List
[
int
]],
List
[
float
],
List
[
int
],
np
.
ndarray
,
pd_Series
,
pd_DataFrame
,
pa_Array
,
pa_ChunkedArray
]]
data
:
Optional
[
Union
[
List
[
List
[
float
]],
List
[
List
[
int
]],
List
[
float
],
List
[
int
],
np
.
ndarray
,
pd_Series
,
pd_DataFrame
,
pa_Table
,
pa_Array
,
pa_ChunkedArray
]]
)
->
"Dataset"
:
"""Set property into the Dataset.
...
...
@@ -2576,7 +2579,16 @@ class Dataset:
return
self
# If the data is Arrow data, we can just pass it to C
if
_is_pyarrow_array
(
data
):
if
_is_pyarrow_array
(
data
)
or
_is_pyarrow_table
(
data
):
# If a table is being passed, we concatenate the columns. This is only valid for
# 'init_score'.
if
_is_pyarrow_table
(
data
):
if
field_name
!=
"init_score"
:
raise
ValueError
(
f
"pyarrow tables are not supported for field '
{
field_name
}
'"
)
data
=
pa_chunked_array
([
chunk
for
array
in
data
.
columns
for
chunk
in
array
.
chunks
# type: ignore
])
c_array
=
_export_arrow_to_c
(
data
)
_safe_call
(
_LIB
.
LGBM_DatasetSetFieldFromArrow
(
self
.
_handle
,
...
...
@@ -2869,7 +2881,7 @@ class Dataset:
Parameters
----------
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task),
pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task)
or None
Init score for Booster.
Returns
...
...
@@ -4443,7 +4455,7 @@ class Booster:
.. versionadded:: 4.0.0
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None)
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task),
pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task)
or None, optional (default=None)
Init score for ``data``.
.. versionadded:: 4.0.0
...
...
python-package/lightgbm/compat.py
View file @
f5b6bd60
...
...
@@ -201,6 +201,7 @@ try:
from
pyarrow
import
Array
as
pa_Array
from
pyarrow
import
ChunkedArray
as
pa_ChunkedArray
from
pyarrow
import
Table
as
pa_Table
from
pyarrow
import
chunked_array
as
pa_chunked_array
from
pyarrow.cffi
import
ffi
as
arrow_cffi
from
pyarrow.types
import
is_floating
as
arrow_is_floating
from
pyarrow.types
import
is_integer
as
arrow_is_integer
...
...
@@ -243,6 +244,7 @@ except ImportError:
all
=
None
equal
=
None
pa_chunked_array
=
None
arrow_is_integer
=
None
arrow_is_floating
=
None
...
...
src/io/dataset.cpp
View file @
f5b6bd60
...
...
@@ -904,6 +904,8 @@ bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray
metadata_
.
SetLabel
(
ca
);
}
else
if
(
name
==
std
::
string
(
"weight"
)
||
name
==
std
::
string
(
"weights"
))
{
metadata_
.
SetWeights
(
ca
);
}
else
if
(
name
==
std
::
string
(
"init_score"
))
{
metadata_
.
SetInitScore
(
ca
);
}
else
if
(
name
==
std
::
string
(
"query"
)
||
name
==
std
::
string
(
"group"
))
{
metadata_
.
SetQuery
(
ca
);
}
else
{
...
...
src/io/metadata.cpp
View file @
f5b6bd60
...
...
@@ -355,32 +355,44 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
}
}
void
Metadata
::
SetInitScore
(
const
double
*
init_score
,
data_size_t
len
)
{
template
<
typename
It
>
void
Metadata
::
SetInitScoresFromIterator
(
It
first
,
It
last
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
//
save to nullptr
if
(
init_score
==
nullptr
||
len
==
0
)
{
//
Clear init scores on empty input
if
(
last
-
first
==
0
)
{
init_score_
.
clear
();
num_init_score_
=
0
;
return
;
}
if
((
len
%
num_data_
)
!=
0
)
{
if
((
(
last
-
first
)
%
num_data_
)
!=
0
)
{
Log
::
Fatal
(
"Initial score size doesn't match data size"
);
}
if
(
init_score_
.
empty
())
{
init_score_
.
resize
(
len
);
}
num_init_score_
=
len
;
if
(
init_score_
.
empty
())
{
init_score_
.
resize
(
last
-
first
);
}
num_init_score_
=
last
-
first
;
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_init_score_ >= 1024)
for
(
int64_t
i
=
0
;
i
<
num_init_score_
;
++
i
)
{
init_score_
[
i
]
=
Common
::
AvoidInf
(
init_score
[
i
]);
init_score_
[
i
]
=
Common
::
AvoidInf
(
first
[
i
]);
}
init_score_load_from_file_
=
false
;
#ifdef USE_CUDA
if
(
cuda_metadata_
!=
nullptr
)
{
cuda_metadata_
->
SetInitScore
(
init_score_
.
data
(),
len
);
cuda_metadata_
->
SetInitScore
(
init_score_
.
data
(),
init_score_
.
size
()
);
}
#endif // USE_CUDA
}
/*!
 * \brief Set initial scores from a raw buffer of doubles.
 * \param init_score Pointer to the first score, or nullptr to clear the stored scores
 * \param len Number of scores in the buffer
 */
void Metadata::SetInitScore(const double* init_score, data_size_t len) {
  // A null buffer carries no scores. Treat it as empty so the iterator-based
  // setter clears the stored scores instead of offsetting and reading through
  // a null pointer when len is non-zero.
  if (init_score == nullptr) {
    len = 0;
  }
  SetInitScoresFromIterator(init_score, init_score + len);
}
/*! \brief Set initial scores from the values of an Arrow chunked array, read as doubles. */
void Metadata::SetInitScore(const ArrowChunkedArray& array) {
  auto scores_begin = array.begin<double>();
  auto scores_end = array.end<double>();
  SetInitScoresFromIterator(scores_begin, scores_end);
}
void
Metadata
::
InsertInitScores
(
const
double
*
init_scores
,
data_size_t
start_index
,
data_size_t
len
,
data_size_t
source_size
)
{
if
(
num_init_score_
<=
0
)
{
Log
::
Fatal
(
"Inserting initial score data into dataset with no initial scores"
);
...
...
tests/python_package_test/test_arrow.py
View file @
f5b6bd60
...
...
@@ -178,7 +178,7 @@ def test_dataset_construct_weights_none():
[
"array_type"
,
"weight_data"
],
[(
pa
.
array
,
[
3
,
0.7
,
1.5
,
0.5
,
0.1
]),
(
pa
.
chunked_array
,
[[
3
],
[
0.7
,
1.5
,
0.5
,
0.1
]])],
)
@
pytest
.
mark
.
parametrize
(
"arrow_type"
,
[
pa
.
float32
(),
pa
.
float64
()]
)
@
pytest
.
mark
.
parametrize
(
"arrow_type"
,
_FLOAT_TYPES
)
def
test_dataset_construct_weights
(
array_type
,
weight_data
,
arrow_type
):
data
=
generate_dummy_arrow_table
()
weights
=
array_type
(
weight_data
,
type
=
arrow_type
)
...
...
@@ -210,3 +210,46 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type):
expected
=
np
.
array
([
0
,
2
,
5
],
dtype
=
np
.
int32
)
np_assert_array_equal
(
expected
,
dataset
.
get_field
(
"group"
),
strict
=
True
)
# ----------------------------------------- INIT SCORES ----------------------------------------- #
@pytest.mark.parametrize(
    ["array_type", "init_score_data"],
    [
        (pa.array, [0, 1, 2, 3, 3]),
        (pa.chunked_array, [[0, 1, 2], [3, 3]]),
        (pa.chunked_array, [[], [0, 1, 2], [3, 3]]),
        (pa.chunked_array, [[0, 1], [], [], [2], [3, 3], []]),
    ],
)
@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES)
def test_dataset_construct_init_scores_array(array_type: Any, init_score_data: Any, arrow_type: Any):
    """Check that Arrow arrays and chunked arrays are accepted as init scores.

    Every parametrized layout (one array, several chunks, empty chunks mixed in)
    holds the same five values, so the constructed dataset must always report
    the same float64 init scores.
    """
    features = generate_dummy_arrow_table()
    arrow_scores = array_type(init_score_data, type=arrow_type)

    dataset = lgb.Dataset(
        features,
        init_score=arrow_scores,
        params=dummy_dataset_params(),
    )
    dataset.construct()

    want = np.array([0, 1, 2, 3, 3], dtype=np.float64)
    got = dataset.get_init_score()
    np_assert_array_equal(want, got, strict=True)
def test_dataset_construct_init_scores_table():
    """Check that a multi-column Arrow table is accepted as init scores.

    The table has three random columns of five values each; the dataset must
    return them as a float64 matrix equal to the table's contents.
    """
    features = generate_dummy_arrow_table()
    score_columns = [generate_random_arrow_array(5, seed=seed) for seed in (1, 2, 3)]
    init_scores = pa.Table.from_arrays(score_columns, names=["a", "b", "c"])

    dataset = lgb.Dataset(
        features,
        init_score=init_scores,
        params=dummy_dataset_params(),
    )
    dataset.construct()

    actual = dataset.get_init_score()
    scores_frame = init_scores.to_pandas()
    expected = scores_frame.to_numpy().astype(np.float64)
    np_assert_array_equal(expected, actual, strict=True)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment