Unverified commit 56313661, authored by Nikita Titov, committed by GitHub
Browse files

[python] fixes for supporting 2d numpy arrays for predictions, grads and hess...


[python] fixes for supporting 2d numpy arrays for predictions, grads and hess in multiclass custom objective and eval (#5030)

* fixes for supporting 2d numpy arrays for predictions, grads and hess in multiclass custom objective

* Apply suggestions from code review
Co-authored-by: José Morales <jmoralz92@gmail.com>
parent 7e478047
......@@ -2752,7 +2752,7 @@ class Booster:
- ``missing_direction`` : str, split direction that missing values should go to. ``None`` for leaf nodes.
- ``missing_type`` : str, describes what types of values are treated as missing.
- ``value`` : float64, predicted value for this leaf node, multiplied by the learning rate.
- ``weight`` : float64 or int64, sum of hessian (second-order derivative of objective), summed over observations that fall in this node.
- ``weight`` : float64 or int64, sum of Hessian (second-order derivative of objective), summed over observations that fall in this node.
- ``count`` : int64, number of records in the training data that fall into this node.
Returns
......@@ -2961,7 +2961,7 @@ class Booster:
The value of the second order derivative (Hessian) of the loss
with respect to the elements of preds for each sample point.
For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array,
For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be returned in the same format.
Returns
......@@ -3000,9 +3000,6 @@ class Booster:
if not self.__set_objective_to_none:
self.reset_parameter({"objective": "none"}).__set_objective_to_none = True
grad, hess = fobj(self.__inner_predict(0), self.train_set)
if self.num_model_per_iteration() > 1:
grad = grad.ravel(order='F')
hess = hess.ravel(order='F')
return self.__boost(grad, hess)
def __boost(self, grad, hess):
......@@ -3012,7 +3009,7 @@ class Booster:
Score is returned before any transformation,
e.g. it is raw margin instead of probability of positive class for binary task.
For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array,
For multi-class task, score are numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be returned in the same format.
Parameters
......@@ -3029,6 +3026,9 @@ class Booster:
is_finished : bool
Whether the boost was successfully finished.
"""
if self.__num_class > 1:
grad = grad.ravel(order='F')
hess = hess.ravel(order='F')
grad = list_to_1d_numpy(grad, name='gradient')
hess = list_to_1d_numpy(hess, name='hessian')
assert grad.flags.c_contiguous
......@@ -3036,12 +3036,11 @@ class Booster:
if len(grad) != len(hess):
raise ValueError(f"Lengths of gradient ({len(grad)}) and Hessian ({len(hess)}) don't match")
num_train_data = self.train_set.num_data()
num_models = self.__num_class
if len(grad) != num_train_data * num_models:
if len(grad) != num_train_data * self.__num_class:
raise ValueError(
f"Lengths of gradient ({len(grad)}) and Hessian ({len(hess)}) "
f"don't match training data length ({num_train_data}) * "
f"number of models per one iteration ({num_models})"
f"number of models per one iteration ({self.__num_class})"
)
is_finished = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
......@@ -3149,8 +3148,9 @@ class Booster:
Should accept two parameters: preds, eval_data,
and return (eval_name, eval_result, is_higher_better) or list of such tuples.
preds : numpy 1-D array
preds : numpy 1-D array or numpy 2-D array (for multi-class task)
The predicted values.
For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
If ``fobj`` is specified, predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task in this case.
eval_data : Dataset
......@@ -3162,9 +3162,6 @@ class Booster:
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array,
and grad and hess should be returned in the same format.
Returns
-------
result : list
......@@ -3199,6 +3196,7 @@ class Booster:
preds : numpy 1-D array or numpy 2-D array (for multi-class task)
The predicted values.
For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
If ``fobj`` is specified, predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task in this case.
eval_data : Dataset
......@@ -3210,9 +3208,6 @@ class Booster:
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array,
and grad and hess should be returned in the same format.
Returns
-------
result : list
......@@ -3232,6 +3227,7 @@ class Booster:
preds : numpy 1-D array or numpy 2-D array (for multi-class task)
The predicted values.
For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
If ``fobj`` is specified, predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task in this case.
eval_data : Dataset
......@@ -3243,9 +3239,6 @@ class Booster:
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array,
and grad and hess should be returned in the same format.
Returns
-------
result : list
......
......@@ -9,13 +9,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
from . import callback
from .basic import (Booster, Dataset, LightGBMError, _ArrayLike, _choose_param_value, _ConfigAliases, _InnerPredictor,
_log_warning)
from .basic import Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, _log_warning
from .compat import SKLEARN_INSTALLED, _LGBMGroupKFold, _LGBMStratifiedKFold
_LGBM_CustomObjectiveFunction = Callable[
[np.ndarray, Dataset],
Tuple[_ArrayLike, _ArrayLike]
Tuple[np.ndarray, np.ndarray]
]
_LGBM_CustomMetricFunction = Callable[
[np.ndarray, Dataset],
......@@ -56,30 +55,30 @@ def train(
Should accept two parameters: preds, train_data,
and return (grad, hess).
preds : numpy 1-D array
preds : numpy 1-D array or numpy 2-D array (for multi-class task)
The predicted values.
Predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task.
train_data : Dataset
The training dataset.
grad : list, numpy 1-D array or pandas Series
grad : numpy 1-D array or numpy 2-D array (for multi-class task)
The value of the first order derivative (gradient) of the loss
with respect to the elements of preds for each sample point.
hess : list, numpy 1-D array or pandas Series
hess : numpy 1-D array or numpy 2-D array (for multi-class task)
The value of the second order derivative (Hessian) of the loss
with respect to the elements of preds for each sample point.
For multi-class task, the preds is group by class_id first, then group by row_id.
If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i]
and you should group grad and hess in this way as well.
For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be returned in the same format.
feval : callable, list of callable, or None, optional (default=None)
Customized evaluation function.
Each evaluation function should accept two parameters: preds, eval_data,
and return (eval_name, eval_result, is_higher_better) or list of such tuples.
preds : numpy 1-D array
preds : numpy 1-D array or numpy 2-D array (for multi-class task)
The predicted values.
For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
If ``fobj`` is specified, predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task in this case.
eval_data : Dataset
......@@ -91,8 +90,6 @@ def train(
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
For multi-class task, the preds is group by class_id first, then group by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
To ignore the default metric corresponding to the used objective,
set the ``metric`` parameter to the string ``"None"`` in ``params``.
init_model : str, pathlib.Path, Booster or None, optional (default=None)
......@@ -411,30 +408,30 @@ def cv(params, train_set, num_boost_round=100,
Should accept two parameters: preds, train_data,
and return (grad, hess).
preds : numpy 1-D array
preds : numpy 1-D array or numpy 2-D array (for multi-class task)
The predicted values.
Predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task.
train_data : Dataset
The training dataset.
grad : list, numpy 1-D array or pandas Series
grad : numpy 1-D array or numpy 2-D array (for multi-class task)
The value of the first order derivative (gradient) of the loss
with respect to the elements of preds for each sample point.
hess : list, numpy 1-D array or pandas Series
hess : numpy 1-D array or numpy 2-D array (for multi-class task)
The value of the second order derivative (Hessian) of the loss
with respect to the elements of preds for each sample point.
For multi-class task, the preds is group by class_id first, then group by row_id.
If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i]
and you should group grad and hess in this way as well.
For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be returned in the same format.
feval : callable, list of callable, or None, optional (default=None)
Customized evaluation function.
Each evaluation function should accept two parameters: preds, eval_data,
and return (eval_name, eval_result, is_higher_better) or list of such tuples.
preds : numpy 1-D array
preds : numpy 1-D array or numpy 2-D array (for multi-class task)
The predicted values.
For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
If ``fobj`` is specified, predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task in this case.
eval_data : Dataset
......@@ -446,8 +443,6 @@ def cv(params, train_set, num_boost_round=100,
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
For multi-class task, the preds is group by class_id first, then group by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
To ignore the default metric corresponding to the used objective,
set ``metrics`` to the string ``"None"``.
init_model : str, pathlib.Path, Booster or None, optional (default=None)
......
......@@ -556,7 +556,7 @@ def create_tree_digraph(
- ``'internal_count'`` : number of records from the training data that fall into this non-leaf node
- ``'internal_weight'`` : total weight of all nodes that fall into this non-leaf node
- ``'leaf_count'`` : number of records from the training data that fall into this leaf node
- ``'leaf_weight'`` : total weight (sum of hessian) of all observations that fall into this leaf node
- ``'leaf_weight'`` : total weight (sum of Hessian) of all observations that fall into this leaf node
- ``'data_percentage'`` : percentage of training data that fall into this node
precision : int or None, optional (default=3)
Used to restrict the display of floating point values to a certain precision.
......@@ -649,7 +649,7 @@ def plot_tree(
- ``'internal_count'`` : number of records from the training data that fall into this non-leaf node
- ``'internal_weight'`` : total weight of all nodes that fall into this non-leaf node
- ``'leaf_count'`` : number of records from the training data that fall into this leaf node
- ``'leaf_weight'`` : total weight (sum of hessian) of all observations that fall into this leaf node
- ``'leaf_weight'`` : total weight (sum of Hessian) of all observations that fall into this leaf node
- ``'data_percentage'`` : percentage of training data that fall into this node
precision : int or None, optional (default=3)
Used to restrict the display of floating point values to a certain precision.
......
......@@ -6,7 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
from .basic import Booster, Dataset, LightGBMError, _ArrayLike, _choose_param_value, _ConfigAliases, _log_warning
from .basic import Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _log_warning
from .callback import record_evaluation
from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
_LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
......@@ -19,11 +19,11 @@ _EvalResultType = Tuple[str, float, bool]
_LGBM_ScikitCustomObjectiveFunction = Union[
Callable[
[np.ndarray, np.ndarray],
Tuple[_ArrayLike, _ArrayLike]
Tuple[np.ndarray, np.ndarray]
],
Callable[
[np.ndarray, np.ndarray, np.ndarray],
Tuple[_ArrayLike, _ArrayLike]
Tuple[np.ndarray, np.ndarray]
],
]
_LGBM_ScikitCustomEvalFunction = Union[
......@@ -72,13 +72,13 @@ class _ObjectiveFunctionWrapper:
grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape [n_samples, n_classes] (for multi-class task)
The value of the first order derivative (gradient) of the loss
with respect to the elements of y_pred for each sample point.
hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape [n_samples, n_classes] (for multi-class task)
hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The value of the second order derivative (Hessian) of the loss
with respect to the elements of y_pred for each sample point.
.. note::
For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array,
For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be returned in the same format.
"""
self.func = func
......@@ -95,10 +95,10 @@ class _ObjectiveFunctionWrapper:
Returns
-------
grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape [n_samples, n_classes] (for multi-class task)
grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The value of the first order derivative (gradient) of the loss
with respect to the elements of preds for each sample point.
hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape [n_samples, n_classes] (for multi-class task)
hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The value of the second order derivative (Hessian) of the loss
with respect to the elements of preds for each sample point.
"""
......@@ -162,11 +162,6 @@ class _EvalFunctionWrapper:
The eval result.
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
.. note::
For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array,
and grad and hess should be returned in the same format.
"""
self.func = func
......@@ -297,9 +292,6 @@ _lgbmmodel_doc_custom_eval_note = """
The eval result.
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array,
and grad and hess should be returned in the same format.
"""
_lgbmmodel_doc_predict = (
......@@ -415,7 +407,7 @@ class LGBMModel(_LGBMModelBase):
min_split_gain : float, optional (default=0.)
Minimum loss reduction required to make a further partition on a leaf node of the tree.
min_child_weight : float, optional (default=1e-3)
Minimum sum of instance weight (hessian) needed in a child (leaf).
Minimum sum of instance weight (Hessian) needed in a child (leaf).
min_child_samples : int, optional (default=20)
Minimum number of data needed in a child (leaf).
subsample : float, optional (default=1.)
......@@ -473,7 +465,7 @@ class LGBMModel(_LGBMModelBase):
The value of the second order derivative (Hessian) of the loss
with respect to the elements of y_pred for each sample point.
For multi-class task, preds are a [n_samples, n_classes] numpy 2-D array,
For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be returned in the same format.
"""
if not SKLEARN_INSTALLED:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment