[python-package] Allow to pass Arrow table and array as init scores (#6167)

f5b6bd60 · Oliver Borchert · GitHub · 5083df15 · f5b6bd60 · f5b6bd60
Unverified commit f5b6bd60, authored Dec 04, 2023 by Oliver Borchert; committed by GitHub on Dec 04, 2023
7 changed files
--- a/include/LightGBM/c_api.h
+++ b/include/LightGBM/c_api.h
@@ -559,9 +559,10 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle,
 * \brief Set vector to a content in info.
 * \note
 * - \a group converts input datatype into ``int32``;
- * - \a label and \a weight convert input datatype into ``float32``.
+ * - \a label and \a weight convert input datatype into ``float32``;
+ * - \a init_score converts input datatype into ``float64``.
 * \param handle Handle of dataset
- * \param field_name Field name, can be \a label, \a weight, \a group
+ * \param field_name Field name, can be \a label, \a weight, \a init_score, \a group
 * \param n_chunks The number of Arrow arrays passed to this function
 * \param chunks Pointer to the list of Arrow arrays
 * \param schema Pointer to the schema of all Arrow arrays

--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -125,6 +125,7 @@ class Metadata {
  * \param init_score Initial scores, this class will manage memory for init_score.
  */
  void SetInitScore(const double* init_score, data_size_t len);
+  void SetInitScore(const ArrowChunkedArray& array);


  /*!
@@ -347,6 +348,9 @@ class Metadata {
  void SetWeightsFromIterator(It first, It last);
  /*! \brief Insert initial scores at the given index */
  void InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size);
+  /*! \brief Set init scores from pointers to the first element and the end of an iterator. */
+  template <typename It>
+  void SetInitScoresFromIterator(It first, It last);
  /*! \brief Insert queries at the given index */
  void InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len);
  /*! \brief Set queries from pointers to the first element and the end of an iterator. */

--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -19,8 +19,8 @@ import numpy as np
 import scipy.sparse

 from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat,
-                     dt_DataTable, pa_Array, pa_ChunkedArray, pa_compute, pa_Table, pd_CategoricalDtype, pd_DataFrame,
-                     pd_Series)
+                     dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table,
+                     pd_CategoricalDtype, pd_DataFrame, pd_Series)
 from .libpath import find_lib_path

 if TYPE_CHECKING:
@@ -84,6 +84,9 @@ _LGBM_InitScoreType = Union[
    np.ndarray,
    pd_Series,
    pd_DataFrame,
+    pa_Table,
+    pa_Array,
+    pa_ChunkedArray,
 ]
 _LGBM_TrainDataType = Union[
    str,
@@ -1660,7 +1663,7 @@ class Dataset:
            sum(group) = n_samples.
            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
-        init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None)
+        init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task), or None, optional (default=None)
            Init score for Dataset.
        feature_name : list of str, or 'auto', optional (default="auto")
            Feature names.
@@ -2440,7 +2443,7 @@ class Dataset:
            sum(group) = n_samples.
            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
-        init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None)
+        init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task), or None, optional (default=None)
            Init score for Dataset.
        params : dict or None, optional (default=None)
            Other parameters for validation Dataset.
@@ -2547,7 +2550,7 @@ class Dataset:
    def set_field(
        self,
        field_name: str,
-        data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Array, pa_ChunkedArray]]
+        data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Table, pa_Array, pa_ChunkedArray]]
    ) -> "Dataset":
        """Set property into the Dataset.

@@ -2576,7 +2579,16 @@ class Dataset:
            return self

        # If the data is a arrow data, we can just pass it to C
-        if _is_pyarrow_array(data):
+        if _is_pyarrow_array(data) or _is_pyarrow_table(data):
+            # If a table is being passed, we concatenate the columns. This is only valid for
+            # 'init_score'.
+            if _is_pyarrow_table(data):
+                if field_name != "init_score":
+                    raise ValueError(f"pyarrow tables are not supported for field '{field_name}'")
+                data = pa_chunked_array([
+                    chunk for array in data.columns for chunk in array.chunks  # type: ignore
+                ])
+
            c_array = _export_arrow_to_c(data)
            _safe_call(_LIB.LGBM_DatasetSetFieldFromArrow(
                self._handle,
@@ -2869,7 +2881,7 @@ class Dataset:

        Parameters
        ----------
-        init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None
+        init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task), or None
            Init score for Booster.

        Returns
@@ -4443,7 +4455,7 @@ class Booster:

            .. versionadded:: 4.0.0

-        init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None)
+        init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task), or None, optional (default=None)
            Init score for ``data``.

            .. versionadded:: 4.0.0

--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -201,6 +201,7 @@ try:
    from pyarrow import Array as pa_Array
    from pyarrow import ChunkedArray as pa_ChunkedArray
    from pyarrow import Table as pa_Table
+    from pyarrow import chunked_array as pa_chunked_array
    from pyarrow.cffi import ffi as arrow_cffi
    from pyarrow.types import is_floating as arrow_is_floating
    from pyarrow.types import is_integer as arrow_is_integer
@@ -243,6 +244,7 @@ except ImportError:
        all = None
        equal = None

+    pa_chunked_array = None
    arrow_is_integer = None
    arrow_is_floating = None


--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -904,6 +904,8 @@ bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray
    metadata_.SetLabel(ca);
  } else if (name == std::string("weight") || name == std::string("weights")) {
    metadata_.SetWeights(ca);
+  } else if (name == std::string("init_score")) {
+    metadata_.SetInitScore(ca);
  } else if (name == std::string("query") || name == std::string("group")) {
    metadata_.SetQuery(ca);
  } else {

--- a/src/io/metadata.cpp
+++ b/src/io/metadata.cpp
@@ -355,32 +355,44 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
  }
 }

-void Metadata::SetInitScore(const double* init_score, data_size_t len) {
+template <typename It>
+void Metadata::SetInitScoresFromIterator(It first, It last) {
  std::lock_guard<std::mutex> lock(mutex_);
-  // save to nullptr
-  if (init_score == nullptr || len == 0) {
+  // Clear init scores on empty input
+  if (last - first == 0) {
    init_score_.clear();
    num_init_score_ = 0;
    return;
  }
-  if ((len % num_data_) != 0) {
+  if (((last - first) % num_data_) != 0) {
    Log::Fatal("Initial score size doesn't match data size");
  }
-  if (init_score_.empty()) { init_score_.resize(len); }
-  num_init_score_ = len;
+  if (init_score_.empty()) {
+    init_score_.resize(last - first);
+  }
+  num_init_score_ = last - first;

  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_init_score_ >= 1024)
  for (int64_t i = 0; i < num_init_score_; ++i) {
-    init_score_[i] = Common::AvoidInf(init_score[i]);
+    init_score_[i] = Common::AvoidInf(first[i]);
  }
  init_score_load_from_file_ = false;
+
  #ifdef USE_CUDA
  if (cuda_metadata_ != nullptr) {
-    cuda_metadata_->SetInitScore(init_score_.data(), len);
+    cuda_metadata_->SetInitScore(init_score_.data(), init_score_.size());
  }
  #endif  // USE_CUDA
 }

+void Metadata::SetInitScore(const double* init_score, data_size_t len) {
+  SetInitScoresFromIterator(init_score, init_score + len);
+}
+
+void Metadata::SetInitScore(const ArrowChunkedArray& array) {
+  SetInitScoresFromIterator(array.begin<double>(), array.end<double>());
+}
+
 void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) {
  if (num_init_score_ <= 0) {
    Log::Fatal("Inserting initial score data into dataset with no initial scores");

--- a/tests/python_package_test/test_arrow.py
+++ b/tests/python_package_test/test_arrow.py
@@ -178,7 +178,7 @@ def test_dataset_construct_weights_none():
    ["array_type", "weight_data"],
    [(pa.array, [3, 0.7, 1.5, 0.5, 0.1]), (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]])],
 )
-@pytest.mark.parametrize("arrow_type", [pa.float32(), pa.float64()])
+@pytest.mark.parametrize("arrow_type", _FLOAT_TYPES)
 def test_dataset_construct_weights(array_type, weight_data, arrow_type):
    data = generate_dummy_arrow_table()
    weights = array_type(weight_data, type=arrow_type)
@@ -210,3 +210,46 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type):

    expected = np.array([0, 2, 5], dtype=np.int32)
    np_assert_array_equal(expected, dataset.get_field("group"), strict=True)
+
+
+# ----------------------------------------- INIT SCORES ----------------------------------------- #
+
+
+@pytest.mark.parametrize(
+    ["array_type", "init_score_data"],
+    [
+        (pa.array, [0, 1, 2, 3, 3]),
+        (pa.chunked_array, [[0, 1, 2], [3, 3]]),
+        (pa.chunked_array, [[], [0, 1, 2], [3, 3]]),
+        (pa.chunked_array, [[0, 1], [], [], [2], [3, 3], []]),
+    ],
+)
+@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES)
+def test_dataset_construct_init_scores_array(
+    array_type: Any, init_score_data: Any, arrow_type: Any
+):
+    data = generate_dummy_arrow_table()
+    init_scores = array_type(init_score_data, type=arrow_type)
+    dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params())
+    dataset.construct()
+
+    expected = np.array([0, 1, 2, 3, 3], dtype=np.float64)
+    np_assert_array_equal(expected, dataset.get_init_score(), strict=True)
+
+
+def test_dataset_construct_init_scores_table():
+    data = generate_dummy_arrow_table()
+    init_scores = pa.Table.from_arrays(
+        [
+            generate_random_arrow_array(5, seed=1),
+            generate_random_arrow_array(5, seed=2),
+            generate_random_arrow_array(5, seed=3),
+        ],
+        names=["a", "b", "c"],
+    )
+    dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params())
+    dataset.construct()
+
+    actual = dataset.get_init_score()
+    expected = init_scores.to_pandas().to_numpy().astype(np.float64)
+    np_assert_array_equal(expected, actual, strict=True)