Unverified Commit d670a4d6 authored by José Morales, committed by GitHub


[python-package] use 2d collections for predictions, grads and hess in multiclass custom objective (#4925)

* reshape predictions, grad and hess in multiclass custom objective

* add sklearn test. move custom obj to utils. docs for numpy

* use num_model_per_iteration to get num_classes

* update docs and dask multiclass custom objective test

* move reshaping to __inner_predict. add test for feval

* add missing note. remove extra line
parent caa087bc
......@@ -2947,22 +2947,21 @@ class Booster:
Should accept two parameters: preds, train_data,
and return (grad, hess).
preds : numpy 1-D array
preds : numpy 1-D array or numpy 2-D array (for multi-class task)
The predicted values.
Predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task.
train_data : Dataset
The training dataset.
grad : list, numpy 1-D array or pandas Series
grad : numpy 1-D array or numpy 2-D array (for multi-class task)
The value of the first order derivative (gradient) of the loss
with respect to the elements of preds for each sample point.
hess : list, numpy 1-D array or pandas Series
hess : numpy 1-D array or numpy 2-D array (for multi-class task)
The value of the second order derivative (Hessian) of the loss
with respect to the elements of preds for each sample point.
For multi-class task, the preds is group by class_id first, then group by row_id.
If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i]
and you should group grad and hess in this way as well.
For multi-class task, preds is a numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be returned in the same format.
Returns
-------
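For reference, a custom objective under the new layout might look like the following (a minimal sketch, not part of the diff; it mirrors the sklearn_multiclass_custom_objective helper added to tests/python_package_test/utils.py below, adapted to the (preds, train_data) signature used here):

import numpy as np

def multiclass_obj(preds, train_data):
    # preds arrives as a [n_samples, n_classes] 2-D array under the new layout
    y_true = train_data.get_label().astype(np.int32)
    num_rows, num_class = preds.shape
    # numerically stable softmax
    prob = np.exp(preds - preds.max(axis=1, keepdims=True))
    prob /= prob.sum(axis=1, keepdims=True)
    grad = prob.copy()
    grad[np.arange(num_rows), y_true] -= 1.0  # softmax cross-entropy gradient
    hess = (num_class / (num_class - 1)) * prob * (1.0 - prob)  # diagonal Hessian
    return grad, hess  # both [n_samples, n_classes]; no manual flattening needed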
......@@ -3000,6 +2999,9 @@ class Booster:
if not self.__set_objective_to_none:
self.reset_parameter({"objective": "none"}).__set_objective_to_none = True
grad, hess = fobj(self.__inner_predict(0), self.train_set)
if self.num_model_per_iteration() > 1:
grad = grad.ravel(order='F')
hess = hess.ravel(order='F')
return self.__boost(grad, hess)
def __boost(self, grad, hess):
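The Fortran-order ravel converts the 2-D grad and hess back into the class-major 1-D layout the C API expects, i.e. the old score[j * num_data + i] indexing. A quick illustration:

import numpy as np

grad_2d = np.arange(6.0).reshape(3, 2)  # 3 samples, 2 classes: [[0, 1], [2, 3], [4, 5]]
flat = grad_2d.ravel(order='F')         # -> [0, 2, 4, 1, 3, 5]
# all class-0 gradients first, then all class-1 gradients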
......@@ -3009,16 +3011,15 @@ class Booster:
Score is returned before any transformation,
e.g. it is raw margin instead of probability of positive class for binary task.
For multi-class task, the score is group by class_id first, then group by row_id.
If you want to get i-th row score in j-th class, the access way is score[j * num_data + i]
and you should group grad and hess in this way as well.
For multi-class task, score is a numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be passed in the same format.
Parameters
----------
grad : list, numpy 1-D array or pandas Series
grad : numpy 1-D array or numpy 2-D array (for multi-class task)
The value of the first order derivative (gradient) of the loss
with respect to the elements of score for each sample point.
hess : list, numpy 1-D array or pandas Series
hess : numpy 1-D array or numpy 2-D array (for multi-class task)
The value of the second order derivative (Hessian) of the loss
with respect to the elements of score for each sample point.
......@@ -3160,8 +3161,8 @@ class Booster:
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
For multi-class task, the preds is group by class_id first, then group by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
For multi-class task, preds is a numpy 2-D array of shape = [n_samples, n_classes].
Returns
-------
......@@ -3195,7 +3196,7 @@ class Booster:
Should accept two parameters: preds, eval_data,
and return (eval_name, eval_result, is_higher_better) or list of such tuples.
preds : numpy 1-D array
preds : numpy 1-D array or numpy 2-D array (for multi-class task)
The predicted values.
If ``fobj`` is specified, predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task in this case.
......@@ -3208,8 +3209,8 @@ class Booster:
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
For multi-class task, the preds is group by class_id first, then group by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
For multi-class task, preds is a numpy 2-D array of shape = [n_samples, n_classes].
Returns
-------
......@@ -3228,7 +3229,7 @@ class Booster:
Should accept two parameters: preds, eval_data,
and return (eval_name, eval_result, is_higher_better) or list of such tuples.
preds : numpy 1-D array
preds : numpy 1-D array or numpy 2-D array (for multi-class task)
The predicted values.
If ``fobj`` is specified, predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task in this case.
......@@ -3241,8 +3242,8 @@ class Booster:
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
For multi-class task, the preds is group by class_id first, then group by row_id.
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
For multi-class task, preds is a numpy 2-D array of shape = [n_samples, n_classes].
Returns
-------
......@@ -3868,7 +3869,11 @@ class Booster:
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError(f"Wrong length of predict results for data {data_idx}")
self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx]
result = self.__inner_predict_buffer[data_idx]
if self.__num_class > 1:
num_data = result.size // self.__num_class
result = result.reshape(num_data, self.__num_class, order='F')
return result
def __get_eval_info(self):
"""Get inner evaluation count and names."""
......
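The reshape in __inner_predict with order='F' is the exact inverse of that ravel: it turns the flat class-major prediction buffer into a [n_samples, n_classes] view. Illustration:

import numpy as np

flat = np.array([0., 2., 4., 1., 3., 5.])  # class-major buffer from the C API
preds = flat.reshape(3, 2, order='F')      # -> [[0., 1.], [2., 3.], [4., 5.]]
# preds[i, j] is the raw score of sample i for class j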
......@@ -59,7 +59,7 @@ class _ObjectiveFunctionWrapper:
y_true : numpy 1-D array of shape = [n_samples]
The target values.
y_pred : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The predicted values.
Predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task.
......@@ -69,18 +69,17 @@ class _ObjectiveFunctionWrapper:
sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
grad : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The value of the first order derivative (gradient) of the loss
with respect to the elements of y_pred for each sample point.
hess : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The value of the second order derivative (Hessian) of the loss
with respect to the elements of y_pred for each sample point.
.. note::
For multi-class task, the y_pred is group by class_id first, then group by row_id.
If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]
and you should group grad and hess in this way as well.
For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be returned in the same format.
"""
self.func = func
......@@ -89,17 +88,17 @@ class _ObjectiveFunctionWrapper:
Parameters
----------
preds : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
preds : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The predicted values.
dataset : Dataset
The training dataset.
Returns
-------
grad : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The value of the first order derivative (gradient) of the loss
with respect to the elements of preds for each sample point.
hess : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The value of the second order derivative (Hessian) of the loss
with respect to the elements of preds for each sample point.
"""
......@@ -114,20 +113,13 @@ class _ObjectiveFunctionWrapper:
"""weighted for objective"""
weight = dataset.get_weight()
if weight is not None:
"""only one class"""
if len(weight) == len(grad):
grad = np.multiply(grad, weight)
hess = np.multiply(hess, weight)
else:
num_data = len(weight)
num_class = len(grad) // num_data
if num_class * num_data != len(grad):
raise ValueError("Length of grad and hess should equal to num_class * num_data")
for k in range(num_class):
for i in range(num_data):
idx = k * num_data + i
grad[idx] *= weight[i]
hess[idx] *= weight[i]
if grad.ndim == 2: # multi-class
num_data = grad.shape[0]
if weight.size != num_data:
raise ValueError("grad and hess should be of shape [n_samples, n_classes]")
weight = weight.reshape(num_data, 1)
grad *= weight
hess *= weight
return grad, hess
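The rewritten weighting relies on numpy broadcasting: reshaping the per-sample weights into a column vector applies each weight across all classes of its row, replacing the old per-element loop. Illustration:

import numpy as np

grad = np.ones((4, 3))                   # [n_samples, n_classes]
weight = np.array([0.5, 1.0, 2.0, 1.0])  # one weight per sample
grad *= weight.reshape(-1, 1)            # row i is scaled by weight[i] in every class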
......@@ -152,7 +144,7 @@ class _EvalFunctionWrapper:
y_true : numpy 1-D array of shape = [n_samples]
The target values.
y_pred : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The predicted values.
In case of custom ``objective``, predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task in this case.
......@@ -173,8 +165,8 @@ class _EvalFunctionWrapper:
.. note::
For multi-class task, the y_pred is group by class_id first, then group by row_id.
If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes].
"""
self.func = func
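A custom eval function now receives the 2-D preds directly, so a multiclass metric needs no manual reshaping. A minimal sketch (it mirrors the custom_eval used in the tests below):

from sklearn.metrics import log_loss

def multiclass_logloss_eval(preds, eval_data):
    # preds: [n_samples, n_classes]; already transformed to probabilities
    # when no custom objective is in use
    y_true = eval_data.get_label()
    return 'custom_logloss', log_loss(y_true, preds), False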
......@@ -183,7 +175,7 @@ class _EvalFunctionWrapper:
Parameters
----------
preds : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
preds : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The predicted values.
dataset : Dataset
The training dataset.
......@@ -287,7 +279,7 @@ _lgbmmodel_doc_custom_eval_note = """
y_true : numpy 1-D array of shape = [n_samples]
The target values.
y_pred : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The predicted values.
In case of custom ``objective``, predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task in this case.
......@@ -306,8 +298,8 @@ _lgbmmodel_doc_custom_eval_note = """
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
For multi-class task, the y_pred is group by class_id first, then group by row_id.
If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes].
"""
_lgbmmodel_doc_predict = (
......@@ -464,7 +456,7 @@ class LGBMModel(_LGBMModelBase):
y_true : numpy 1-D array of shape = [n_samples]
The target values.
y_pred : numpy 1-D array of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The predicted values.
Predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task.
......@@ -474,16 +466,15 @@ class LGBMModel(_LGBMModelBase):
sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
grad : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The value of the first order derivative (gradient) of the loss
with respect to the elements of y_pred for each sample point.
hess : list, numpy 1-D array or pandas Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The value of the second order derivative (Hessian) of the loss
with respect to the elements of y_pred for each sample point.
For multi-class task, the y_pred is group by class_id first, then group by row_id.
If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]
and you should group grad and hess in this way as well.
For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be returned in the same format.
"""
if not SKLEARN_INSTALLED:
raise LightGBMError('scikit-learn is required for lightgbm.sklearn. '
......
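With the scikit-learn interface the objective signature is (y_true, y_pred) rather than (preds, dataset); sklearn_multiclass_custom_objective in tests/python_package_test/utils.py below is a complete example. A usage sketch, assuming that helper and the softmax helper are importable:

import lightgbm as lgb
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=1_000, centers=3, random_state=42)
clf = lgb.LGBMClassifier(n_estimators=10, objective=sklearn_multiclass_custom_objective)
clf.fit(X, y)
raw = clf.predict(X, raw_score=True)  # [n_samples, n_classes] raw scores
prob = softmax(raw)                   # probabilities must be recovered manually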
......@@ -7,13 +7,14 @@ from pathlib import Path
import numpy as np
import pytest
from scipy import sparse
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from sklearn.datasets import dump_svmlight_file, load_svmlight_file, make_blobs
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series
from .utils import load_breast_cancer
from .utils import load_breast_cancer, sklearn_multiclass_custom_objective, softmax
def test_basic(tmp_path):
......@@ -587,7 +588,7 @@ def _bad_gradients(preds, _):
def _good_gradients(preds, _):
return np.random.randn(len(preds)), np.random.rand(len(preds))
return np.random.randn(*preds.shape), np.random.rand(*preds.shape)
def test_custom_objective_safety():
......@@ -609,3 +610,51 @@ def test_custom_objective_safety():
good_bst_multi.update(fobj=_good_gradients)
with pytest.raises(ValueError, match=re.escape(f"number of models per one iteration ({nclass})")):
bad_bst_multi.update(fobj=_bad_gradients)
def test_multiclass_custom_objective():
def custom_obj(y_pred, ds):
y_true = ds.get_label()
return sklearn_multiclass_custom_objective(y_true, y_pred)
centers = [[-4, -4], [4, 4], [-4, 4]]
X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
ds = lgb.Dataset(X, y)
params = {'objective': 'multiclass', 'num_class': 3, 'num_leaves': 7}
builtin_obj_bst = lgb.train(params, ds, num_boost_round=10)
builtin_obj_preds = builtin_obj_bst.predict(X)
custom_obj_bst = lgb.train(params, ds, num_boost_round=10, fobj=custom_obj)
custom_obj_preds = softmax(custom_obj_bst.predict(X))
np.testing.assert_allclose(builtin_obj_preds, custom_obj_preds, rtol=0.01)
def test_multiclass_custom_eval():
def custom_eval(y_pred, ds):
y_true = ds.get_label()
return 'custom_logloss', log_loss(y_true, y_pred), False
centers = [[-4, -4], [4, 4], [-4, 4]]
X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)
train_ds = lgb.Dataset(X_train, y_train)
valid_ds = lgb.Dataset(X_valid, y_valid, reference=train_ds)
params = {'objective': 'multiclass', 'num_class': 3, 'num_leaves': 7}
eval_result = {}
bst = lgb.train(
params,
train_ds,
num_boost_round=10,
valid_sets=[train_ds, valid_ds],
valid_names=['train', 'valid'],
feval=custom_eval,
callbacks=[lgb.record_evaluation(eval_result)],
keep_training_booster=True,
)
for key, ds in zip(['train', 'valid'], [train_ds, valid_ds]):
np.testing.assert_allclose(eval_result[key]['multi_logloss'], eval_result[key]['custom_logloss'])
_, metric, value, _ = bst.eval(ds, key, feval=custom_eval)[1] # first element is multi_logloss
assert metric == 'custom_logloss'
np.testing.assert_allclose(value, eval_result[key][metric][-1])
......@@ -15,6 +15,8 @@ import pytest
import lightgbm as lgb
from .utils import sklearn_multiclass_custom_objective
if not platform.startswith('linux'):
pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True)
if machine() != 'x86_64':
......@@ -271,25 +273,6 @@ def _objective_logistic_regression(y_true, y_pred):
return grad, hess
def _objective_logloss(y_true, y_pred):
num_rows = len(y_true)
num_class = len(np.unique(y_true))
# operate on preds as [num_data, num_classes] matrix
y_pred = y_pred.reshape(-1, num_class, order='F')
row_wise_max = np.max(y_pred, axis=1).reshape(num_rows, 1)
preds = y_pred - row_wise_max
prob = np.exp(preds) / np.sum(np.exp(preds), axis=1).reshape(num_rows, 1)
grad_update = np.zeros_like(preds)
grad_update[np.arange(num_rows), y_true.astype(np.int32)] = -1.0
grad = prob + grad_update
factor = num_class / (num_class - 1)
hess = factor * prob * (1 - prob)
# reshape back to 1-D array, grouped by class id and then row id
grad = grad.T.reshape(-1)
hess = hess.T.reshape(-1)
return grad, hess
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
@pytest.mark.parametrize('boosting_type', boosting_types)
......@@ -507,7 +490,7 @@ def test_classifier_custom_objective(output, task, cluster):
})
elif task == 'multiclass-classification':
params.update({
'objective': _objective_logloss,
'objective': sklearn_multiclass_custom_objective,
'num_classes': 3
})
......
......@@ -7,7 +7,7 @@ import joblib
import numpy as np
import pytest
from sklearn.base import clone
from sklearn.datasets import load_svmlight_file, make_multilabel_classification
from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
......@@ -18,7 +18,7 @@ from sklearn.utils.validation import check_is_fitted
import lightgbm as lgb
from .utils import (load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking,
make_synthetic_regression)
make_synthetic_regression, sklearn_multiclass_custom_objective, softmax)
decreasing_generator = itertools.count(0, -1)
......@@ -1280,3 +1280,20 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
preds_1d = model_1d.predict(X)
preds_2d = model_2d.predict(X)
np.testing.assert_array_equal(preds_1d, preds_2d)
def test_multiclass_custom_objective():
centers = [[-4, -4], [4, 4], [-4, 4]]
X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
params = {'n_estimators': 10, 'num_leaves': 7}
builtin_obj_model = lgb.LGBMClassifier(**params)
builtin_obj_model.fit(X, y)
builtin_obj_preds = builtin_obj_model.predict_proba(X)
custom_obj_model = lgb.LGBMClassifier(objective=sklearn_multiclass_custom_objective, **params)
custom_obj_model.fit(X, y)
custom_obj_preds = softmax(custom_obj_model.predict(X, raw_score=True))
np.testing.assert_allclose(builtin_obj_preds, custom_obj_preds, rtol=0.01)
assert not callable(builtin_obj_model.objective_)
assert callable(custom_obj_model.objective_)
......@@ -114,3 +114,20 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
@lru_cache(maxsize=None)
def make_synthetic_regression(n_samples=100):
return sklearn.datasets.make_regression(n_samples, n_features=4, n_informative=2, random_state=42)
def softmax(x):
row_wise_max = np.max(x, axis=1).reshape(-1, 1)
exp_x = np.exp(x - row_wise_max)
return exp_x / np.sum(exp_x, axis=1).reshape(-1, 1)
def sklearn_multiclass_custom_objective(y_true, y_pred):
num_rows, num_class = y_pred.shape
prob = softmax(y_pred)
grad_update = np.zeros_like(prob)
grad_update[np.arange(num_rows), y_true.astype(np.int32)] = -1.0
grad = prob + grad_update
factor = num_class / (num_class - 1)
hess = factor * prob * (1 - prob)
return grad, hess
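The gradient above is the standard softmax cross-entropy one (probabilities minus the one-hot targets), and the num_class / (num_class - 1) factor on the Hessian matches the scaling used by LightGBM's built-in multiclass objective. A finite-difference sanity check of the gradient (a sketch, assuming both helpers above are in scope):

import numpy as np

rng = np.random.default_rng(0)
y_pred = rng.normal(size=(5, 3))
y_true = rng.integers(0, 3, size=5)
grad, _ = sklearn_multiclass_custom_objective(y_true, y_pred)

def nll(scores):
    # negative log-likelihood of the true classes under the softmax
    return -np.log(softmax(scores)[np.arange(5), y_true]).sum()

eps = 1e-6
num_grad = np.zeros_like(y_pred)
for i in range(5):
    for j in range(3):
        bumped = y_pred.copy()
        bumped[i, j] += eps
        num_grad[i, j] = (nll(bumped) - nll(y_pred)) / eps
np.testing.assert_allclose(grad, num_grad, atol=1e-4)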