Unverified Commit ec5492f8 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[python-package] drop support for h2o datatable (#6894)

parent a7253603
......@@ -35,7 +35,7 @@ The LightGBM Python module can load data from:
- LibSVM (zero-based) / TSV / CSV format text file
- NumPy 2D array(s), pandas DataFrame, H2O DataTable's Frame (deprecated), SciPy sparse matrix
- NumPy 2D array(s), pandas DataFrame, pyarrow Table, SciPy sparse matrix
- LightGBM binary file
......
......@@ -100,7 +100,6 @@ autodoc_default_options = {
autodoc_mock_imports = [
"dask",
"dask.distributed",
"datatable",
"graphviz",
"matplotlib",
"numpy",
......
......@@ -35,7 +35,6 @@ from .compat import (
arrow_is_floating,
arrow_is_integer,
concat,
dt_DataTable,
pa_Array,
pa_chunked_array,
pa_ChunkedArray,
......@@ -116,7 +115,6 @@ _LGBM_TrainDataType = Union[
Path,
np.ndarray,
pd_DataFrame,
dt_DataTable,
scipy.sparse.spmatrix,
"Sequence",
List["Sequence"],
......@@ -137,7 +135,6 @@ _LGBM_PredictDataType = Union[
Path,
np.ndarray,
pd_DataFrame,
dt_DataTable,
scipy.sparse.spmatrix,
pa_Table,
]
......@@ -577,15 +574,6 @@ class LGBMDeprecationWarning(FutureWarning):
pass
def _emit_datatable_deprecation_warning() -> None:
msg = (
"Support for 'datatable' in LightGBM is deprecated, and will be removed in a future release. "
"To avoid this warning, convert 'datatable' inputs to a supported format "
"(for example, use the 'to_numpy()' method)."
)
warnings.warn(msg, category=LGBMDeprecationWarning, stacklevel=2)
class _ConfigAliases:
# lazy evaluation to allow import without dynamic library, e.g., for docs generation
aliases = None
......@@ -1112,7 +1100,7 @@ class _InnerPredictor:
Parameters
----------
data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame (deprecated) or scipy.sparse
data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table or scipy.sparse
Data source for prediction.
If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
start_iteration : int, optional (default=0)
......@@ -1225,14 +1213,6 @@ class _InnerPredictor:
num_iteration=num_iteration,
predict_type=predict_type,
)
elif isinstance(data, dt_DataTable):
_emit_datatable_deprecation_warning()
preds, nrow = self.__pred_for_np2d(
mat=data.to_numpy(),
start_iteration=start_iteration,
num_iteration=num_iteration,
predict_type=predict_type,
)
else:
try:
_log_warning("Converting data to scipy sparse matrix.")
......@@ -1790,7 +1770,7 @@ class Dataset:
Parameters
----------
data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, Sequence, list of Sequence, list of numpy array or pyarrow Table
data : str, pathlib.Path, numpy array, pandas DataFrame, scipy.sparse, Sequence, list of Sequence, list of numpy array or pyarrow Table
Data source of Dataset.
If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None)
......@@ -2196,9 +2176,6 @@ class Dataset:
raise TypeError("Data list can only be of ndarray or Sequence")
elif isinstance(data, Sequence):
self.__init_from_seqs([data], ref_dataset)
elif isinstance(data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.__init_from_np2d(data.to_numpy(), params_str, ref_dataset)
else:
try:
csr = scipy.sparse.csr_matrix(data)
......@@ -2619,7 +2596,7 @@ class Dataset:
Parameters
----------
data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, Sequence, list of Sequence or list of numpy array
data : str, pathlib.Path, numpy array, pandas DataFrame, scipy.sparse, Sequence, list of Sequence or list of numpy array
Data source of Dataset.
If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file.
label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None)
......@@ -3276,7 +3253,7 @@ class Dataset:
Returns
-------
data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, Sequence, list of Sequence or list of numpy array or None
data : str, pathlib.Path, numpy array, pandas DataFrame, scipy.sparse, Sequence, list of Sequence or list of numpy array or None
Raw data used in the Dataset construction.
"""
if self._handle is None:
......@@ -3288,9 +3265,6 @@ class Dataset:
self.data = self.data[self.used_indices, :]
elif isinstance(self.data, pd_DataFrame):
self.data = self.data.iloc[self.used_indices].copy()
elif isinstance(self.data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.data = self.data[self.used_indices, :]
elif isinstance(self.data, Sequence):
self.data = self.data[self.used_indices]
elif _is_list_of_sequences(self.data) and len(self.data) > 0:
......@@ -3477,9 +3451,6 @@ class Dataset:
self.data = np.hstack((self.data, other.data.toarray()))
elif isinstance(other.data, pd_DataFrame):
self.data = np.hstack((self.data, other.data.values))
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.data = np.hstack((self.data, other.data.to_numpy()))
else:
self.data = None
elif isinstance(self.data, scipy.sparse.spmatrix):
......@@ -3488,9 +3459,6 @@ class Dataset:
self.data = scipy.sparse.hstack((self.data, other.data), format=sparse_format)
elif isinstance(other.data, pd_DataFrame):
self.data = scipy.sparse.hstack((self.data, other.data.values), format=sparse_format)
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.data = scipy.sparse.hstack((self.data, other.data.to_numpy()), format=sparse_format)
else:
self.data = None
elif isinstance(self.data, pd_DataFrame):
......@@ -3506,21 +3474,6 @@ class Dataset:
self.data = concat((self.data, pd_DataFrame(other.data.toarray())), axis=1, ignore_index=True)
elif isinstance(other.data, pd_DataFrame):
self.data = concat((self.data, other.data), axis=1, ignore_index=True)
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.data = concat((self.data, pd_DataFrame(other.data.to_numpy())), axis=1, ignore_index=True)
else:
self.data = None
elif isinstance(self.data, dt_DataTable):
_emit_datatable_deprecation_warning()
if isinstance(other.data, np.ndarray):
self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data)))
elif isinstance(other.data, scipy.sparse.spmatrix):
self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.toarray())))
elif isinstance(other.data, pd_DataFrame):
self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.values)))
elif isinstance(other.data, dt_DataTable):
self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.to_numpy())))
else:
self.data = None
else:
......@@ -4717,7 +4670,7 @@ class Booster:
Parameters
----------
data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame (deprecated) or scipy.sparse
data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table or scipy.sparse
Data source for prediction.
If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
start_iteration : int, optional (default=0)
......@@ -4798,7 +4751,7 @@ class Booster:
Parameters
----------
data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, Sequence, list of Sequence or list of numpy array
data : str, pathlib.Path, numpy array, pandas DataFrame, scipy.sparse, Sequence, list of Sequence or list of numpy array
Data source for refit.
If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM).
label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array or pyarrow ChunkedArray
......
......@@ -202,25 +202,6 @@ try:
except ImportError:
GRAPHVIZ_INSTALLED = False
"""datatable"""
try:
import datatable
if hasattr(datatable, "Frame"):
dt_DataTable = datatable.Frame
else:
dt_DataTable = datatable.DataTable
DATATABLE_INSTALLED = True
except ImportError:
DATATABLE_INSTALLED = False
class dt_DataTable: # type: ignore
"""Dummy class for datatable.DataTable."""
def __init__(self, *args: Any, **kwargs: Any):
pass
"""dask"""
try:
from dask import delayed
......
......@@ -41,7 +41,6 @@ from .compat import (
_LGBMRegressorBase,
_LGBMValidateData,
_sklearn_version,
dt_DataTable,
pd_DataFrame,
)
from .engine import train
......@@ -58,7 +57,6 @@ __all__ = [
]
_LGBM_ScikitMatrixLike = Union[
dt_DataTable,
List[Union[List[float], List[int]]],
np.ndarray,
pd_DataFrame,
......@@ -945,7 +943,7 @@ class LGBMModel(_LGBMModelBase):
params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
params["metric"] = [metric for metric in params["metric"] if metric is not None]
if not isinstance(X, (pd_DataFrame, dt_DataTable)):
if not isinstance(X, pd_DataFrame):
_X, _y = _LGBMValidateData(
self,
X,
......@@ -1077,7 +1075,7 @@ class LGBMModel(_LGBMModelBase):
fit.__doc__ = (
_lgbmmodel_doc_fit.format(
X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
X_shape="numpy array, pandas DataFrame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)",
init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
......@@ -1104,7 +1102,7 @@ class LGBMModel(_LGBMModelBase):
"""Docstring is set after definition, using a template."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
if not isinstance(X, (pd_DataFrame, dt_DataTable)):
if not isinstance(X, pd_DataFrame):
X = _LGBMValidateData(
self,
X,
......@@ -1154,7 +1152,7 @@ class LGBMModel(_LGBMModelBase):
predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
X_shape="numpy array, pandas DataFrame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
output_name="predicted_result",
predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
......@@ -1648,7 +1646,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted probability for each class for each sample.",
X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame (deprecated), scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
X_shape="numpy array, pandas DataFrame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
output_name="predicted_probability",
predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
......
......@@ -24,10 +24,8 @@ from sklearn.utils.validation import check_is_fitted
import lightgbm as lgb
from lightgbm.compat import (
DASK_INSTALLED,
DATATABLE_INSTALLED,
PANDAS_INSTALLED,
_sklearn_version,
dt_DataTable,
pd_DataFrame,
pd_Series,
)
......@@ -1883,14 +1881,12 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl
assert preds.shape[0] == y.shape[0]
@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task, rng):
if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
pytest.skip("pandas is not installed")
if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
pytest.skip("datatable is not installed")
X, y, g = _create_data(task, n_samples=2_000)
weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
......@@ -1902,9 +1898,7 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
raise ValueError(f"Unrecognized task '{task}'")
X_valid = X * 2
if X_type == "dt_DataTable":
X = dt_DataTable(X)
elif X_type == "list2d":
if X_type == "list2d":
X = X.tolist()
elif X_type == "scipy_csc":
X = scipy.sparse.csc_matrix(X)
......@@ -1960,22 +1954,18 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
raise ValueError(f"Unrecognized task: '{task}'")
@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"])
@pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"])
def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type, rng):
if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
pytest.skip("pandas is not installed")
if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
pytest.skip("datatable is not installed")
X, y, g = _create_data(task="ranking", n_samples=1_000)
weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
init_score = np.full_like(y, np.mean(y))
X_valid = X * 2
if X_type == "dt_DataTable":
X = dt_DataTable(X)
elif X_type == "list2d":
if X_type == "list2d":
X = X.tolist()
elif X_type == "scipy_csc":
X = scipy.sparse.csc_matrix(X)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment