Unverified Commit f5b6bd60 authored by Oliver Borchert's avatar Oliver Borchert Committed by GitHub
Browse files

[python-package] Allow to pass Arrow table and array as init scores (#6167)

parent 5083df15
...@@ -559,9 +559,10 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle, ...@@ -559,9 +559,10 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle,
 * \brief Set vector to a content in info.
 * \note
 * - \a group converts input datatype into ``int32``;
 * - \a label and \a weight convert input datatype into ``float32``;
 * - \a init_score converts input datatype into ``float64``.
 * \param handle Handle of dataset
 * \param field_name Field name, can be \a label, \a weight, \a init_score, \a group
 * \param n_chunks The number of Arrow arrays passed to this function
 * \param chunks Pointer to the list of Arrow arrays
 * \param schema Pointer to the schema of all Arrow arrays
......
...@@ -125,6 +125,7 @@ class Metadata { ...@@ -125,6 +125,7 @@ class Metadata {
* \param init_score Initial scores, this class will manage memory for init_score. * \param init_score Initial scores, this class will manage memory for init_score.
*/ */
void SetInitScore(const double* init_score, data_size_t len); void SetInitScore(const double* init_score, data_size_t len);
void SetInitScore(const ArrowChunkedArray& array);
/*! /*!
...@@ -347,6 +348,9 @@ class Metadata { ...@@ -347,6 +348,9 @@ class Metadata {
void SetWeightsFromIterator(It first, It last); void SetWeightsFromIterator(It first, It last);
/*! \brief Insert initial scores at the given index */ /*! \brief Insert initial scores at the given index */
void InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size); void InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size);
/*! \brief Set init scores from pointers to the first element and the end of an iterator. */
template <typename It>
void SetInitScoresFromIterator(It first, It last);
/*! \brief Insert queries at the given index */ /*! \brief Insert queries at the given index */
void InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len); void InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len);
/*! \brief Set queries from pointers to the first element and the end of an iterator. */ /*! \brief Set queries from pointers to the first element and the end of an iterator. */
......
...@@ -19,8 +19,8 @@ import numpy as np ...@@ -19,8 +19,8 @@ import numpy as np
import scipy.sparse import scipy.sparse
from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat, from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat,
dt_DataTable, pa_Array, pa_ChunkedArray, pa_compute, pa_Table, pd_CategoricalDtype, pd_DataFrame, dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table,
pd_Series) pd_CategoricalDtype, pd_DataFrame, pd_Series)
from .libpath import find_lib_path from .libpath import find_lib_path
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -84,6 +84,9 @@ _LGBM_InitScoreType = Union[ ...@@ -84,6 +84,9 @@ _LGBM_InitScoreType = Union[
np.ndarray, np.ndarray,
pd_Series, pd_Series,
pd_DataFrame, pd_DataFrame,
pa_Table,
pa_Array,
pa_ChunkedArray,
] ]
_LGBM_TrainDataType = Union[ _LGBM_TrainDataType = Union[
str, str,
...@@ -1660,7 +1663,7 @@ class Dataset: ...@@ -1660,7 +1663,7 @@ class Dataset:
sum(group) = n_samples. sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None)
Init score for Dataset. Init score for Dataset.
feature_name : list of str, or 'auto', optional (default="auto") feature_name : list of str, or 'auto', optional (default="auto")
Feature names. Feature names.
...@@ -2440,7 +2443,7 @@ class Dataset: ...@@ -2440,7 +2443,7 @@ class Dataset:
sum(group) = n_samples. sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None)
Init score for Dataset. Init score for Dataset.
params : dict or None, optional (default=None) params : dict or None, optional (default=None)
Other parameters for validation Dataset. Other parameters for validation Dataset.
...@@ -2547,7 +2550,7 @@ class Dataset: ...@@ -2547,7 +2550,7 @@ class Dataset:
def set_field( def set_field(
self, self,
field_name: str, field_name: str,
data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Array, pa_ChunkedArray]] data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Table, pa_Array, pa_ChunkedArray]]
) -> "Dataset": ) -> "Dataset":
"""Set property into the Dataset. """Set property into the Dataset.
...@@ -2576,7 +2579,16 @@ class Dataset: ...@@ -2576,7 +2579,16 @@ class Dataset:
return self return self
# If the data is a arrow data, we can just pass it to C # If the data is a arrow data, we can just pass it to C
if _is_pyarrow_array(data): if _is_pyarrow_array(data) or _is_pyarrow_table(data):
# If a table is being passed, we concatenate the columns. This is only valid for
# 'init_score'.
if _is_pyarrow_table(data):
if field_name != "init_score":
raise ValueError(f"pyarrow tables are not supported for field '{field_name}'")
data = pa_chunked_array([
chunk for array in data.columns for chunk in array.chunks # type: ignore
])
c_array = _export_arrow_to_c(data) c_array = _export_arrow_to_c(data)
_safe_call(_LIB.LGBM_DatasetSetFieldFromArrow( _safe_call(_LIB.LGBM_DatasetSetFieldFromArrow(
self._handle, self._handle,
...@@ -2869,7 +2881,7 @@ class Dataset: ...@@ -2869,7 +2881,7 @@ class Dataset:
Parameters Parameters
---------- ----------
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None
Init score for Booster. Init score for Booster.
Returns Returns
...@@ -4443,7 +4455,7 @@ class Booster: ...@@ -4443,7 +4455,7 @@ class Booster:
.. versionadded:: 4.0.0 .. versionadded:: 4.0.0
init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None)
Init score for ``data``. Init score for ``data``.
.. versionadded:: 4.0.0 .. versionadded:: 4.0.0
......
...@@ -201,6 +201,7 @@ try: ...@@ -201,6 +201,7 @@ try:
from pyarrow import Array as pa_Array from pyarrow import Array as pa_Array
from pyarrow import ChunkedArray as pa_ChunkedArray from pyarrow import ChunkedArray as pa_ChunkedArray
from pyarrow import Table as pa_Table from pyarrow import Table as pa_Table
from pyarrow import chunked_array as pa_chunked_array
from pyarrow.cffi import ffi as arrow_cffi from pyarrow.cffi import ffi as arrow_cffi
from pyarrow.types import is_floating as arrow_is_floating from pyarrow.types import is_floating as arrow_is_floating
from pyarrow.types import is_integer as arrow_is_integer from pyarrow.types import is_integer as arrow_is_integer
...@@ -243,6 +244,7 @@ except ImportError: ...@@ -243,6 +244,7 @@ except ImportError:
all = None all = None
equal = None equal = None
pa_chunked_array = None
arrow_is_integer = None arrow_is_integer = None
arrow_is_floating = None arrow_is_floating = None
......
...@@ -904,6 +904,8 @@ bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray ...@@ -904,6 +904,8 @@ bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray
metadata_.SetLabel(ca); metadata_.SetLabel(ca);
} else if (name == std::string("weight") || name == std::string("weights")) { } else if (name == std::string("weight") || name == std::string("weights")) {
metadata_.SetWeights(ca); metadata_.SetWeights(ca);
} else if (name == std::string("init_score")) {
metadata_.SetInitScore(ca);
} else if (name == std::string("query") || name == std::string("group")) { } else if (name == std::string("query") || name == std::string("group")) {
metadata_.SetQuery(ca); metadata_.SetQuery(ca);
} else { } else {
......
...@@ -355,32 +355,44 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data ...@@ -355,32 +355,44 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
} }
} }
void Metadata::SetInitScore(const double* init_score, data_size_t len) { template <typename It>
void Metadata::SetInitScoresFromIterator(It first, It last) {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
// save to nullptr // Clear init scores on empty input
if (init_score == nullptr || len == 0) { if (last - first == 0) {
init_score_.clear(); init_score_.clear();
num_init_score_ = 0; num_init_score_ = 0;
return; return;
} }
if ((len % num_data_) != 0) { if (((last - first) % num_data_) != 0) {
Log::Fatal("Initial score size doesn't match data size"); Log::Fatal("Initial score size doesn't match data size");
} }
if (init_score_.empty()) { init_score_.resize(len); } if (init_score_.empty()) {
num_init_score_ = len; init_score_.resize(last - first);
}
num_init_score_ = last - first;
#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_init_score_ >= 1024) #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_init_score_ >= 1024)
for (int64_t i = 0; i < num_init_score_; ++i) { for (int64_t i = 0; i < num_init_score_; ++i) {
init_score_[i] = Common::AvoidInf(init_score[i]); init_score_[i] = Common::AvoidInf(first[i]);
} }
init_score_load_from_file_ = false; init_score_load_from_file_ = false;
#ifdef USE_CUDA #ifdef USE_CUDA
if (cuda_metadata_ != nullptr) { if (cuda_metadata_ != nullptr) {
cuda_metadata_->SetInitScore(init_score_.data(), len); cuda_metadata_->SetInitScore(init_score_.data(), init_score_.size());
} }
#endif // USE_CUDA #endif // USE_CUDA
} }
/*!
 * \brief Set initial scores from a contiguous buffer of doubles.
 * \param init_score Pointer to the first score; passing nullptr with len == 0
 *        yields an empty range, which clears the stored init scores
 * \param len Number of scores (must be a multiple of the number of data rows)
 */
void Metadata::SetInitScore(const double* init_score, data_size_t len) {
SetInitScoresFromIterator(init_score, init_score + len);
}
/*!
 * \brief Set initial scores from an Arrow chunked array.
 *        Elements are read through the array's typed iterator, converting
 *        each value to double.
 * \param array Arrow chunked array holding the initial scores
 */
void Metadata::SetInitScore(const ArrowChunkedArray& array) {
SetInitScoresFromIterator(array.begin<double>(), array.end<double>());
}
void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) { void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) {
if (num_init_score_ <= 0) { if (num_init_score_ <= 0) {
Log::Fatal("Inserting initial score data into dataset with no initial scores"); Log::Fatal("Inserting initial score data into dataset with no initial scores");
......
...@@ -178,7 +178,7 @@ def test_dataset_construct_weights_none(): ...@@ -178,7 +178,7 @@ def test_dataset_construct_weights_none():
["array_type", "weight_data"], ["array_type", "weight_data"],
[(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]])], [(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]])],
) )
@pytest.mark.parametrize("arrow_type", [pa.float32(), pa.float64()]) @pytest.mark.parametrize("arrow_type", _FLOAT_TYPES)
def test_dataset_construct_weights(array_type, weight_data, arrow_type): def test_dataset_construct_weights(array_type, weight_data, arrow_type):
data = generate_dummy_arrow_table() data = generate_dummy_arrow_table()
weights = array_type(weight_data, type=arrow_type) weights = array_type(weight_data, type=arrow_type)
...@@ -210,3 +210,46 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type): ...@@ -210,3 +210,46 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type):
expected = np.array([0, 2, 5], dtype=np.int32) expected = np.array([0, 2, 5], dtype=np.int32)
np_assert_array_equal(expected, dataset.get_field("group"), strict=True) np_assert_array_equal(expected, dataset.get_field("group"), strict=True)
# ----------------------------------------- INIT SCORES ----------------------------------------- #
@pytest.mark.parametrize(
    ["array_type", "init_score_data"],
    [
        (pa.array, [0, 1, 2, 3, 3]),
        (pa.chunked_array, [[0, 1, 2], [3, 3]]),
        (pa.chunked_array, [[], [0, 1, 2], [3, 3]]),
        (pa.chunked_array, [[0, 1], [], [], [2], [3, 3], []]),
    ],
)
@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES)
def test_dataset_construct_init_scores_array(
    array_type: Any, init_score_data: Any, arrow_type: Any
):
    # Arrow-backed init scores (plain and chunked, including empty chunks)
    # must round-trip through Dataset as float64.
    table = generate_dummy_arrow_table()
    scores = array_type(init_score_data, type=arrow_type)
    ds = lgb.Dataset(table, init_score=scores, params=dummy_dataset_params())
    ds.construct()
    np_assert_array_equal(
        np.array([0, 1, 2, 3, 3], dtype=np.float64), ds.get_init_score(), strict=True
    )
def test_dataset_construct_init_scores_table():
    # A multi-column Arrow table supplies one init-score column per class;
    # the constructed Dataset must return the same values as float64.
    table = generate_dummy_arrow_table()
    scores = pa.Table.from_arrays(
        [generate_random_arrow_array(5, seed=s) for s in (1, 2, 3)],
        names=["a", "b", "c"],
    )
    ds = lgb.Dataset(table, init_score=scores, params=dummy_dataset_params())
    ds.construct()
    np_assert_array_equal(
        scores.to_pandas().to_numpy().astype(np.float64),
        ds.get_init_score(),
        strict=True,
    )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment