[ci] [python-package] update pre-commit hooks to latest versions (#6817)

81922a7e · James Lamb · GitHub · 2db0b25e · 81922a7e · 81922a7e
Unverified Commit 81922a7e authored Feb 07, 2025 by James Lamb Committed by GitHub Feb 07, 2025
13 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:
        args: ["--strict"]
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
-    rev: v0.8.3
+    rev: v0.9.5
    hooks:
      # Run the linter.
      - id: ruff
@@ -39,7 +39,7 @@ repos:
    hooks:
      - id: shellcheck
  - repo: https://github.com/crate-ci/typos
-    rev: v1.28.3
+    rev: v1.29.5
    hooks:
      - id: typos
        args: ["--force-exclude"]

--- a/examples/parallel_learning/train.conf
+++ b/examples/parallel_learning/train.conf
@@ -11,7 +11,7 @@ boosting_type = gbdt
 # alias: application, app
 objective = binary
-# eval metrics, support multi metric, delimite by ',' , support following metrics
+# eval metrics; multiple metrics are supported, delimited by ','. The following metrics are supported:
 # l1
 # l2 , default metric for regression
 # ndcg , default metric for lambdarank

--- a/examples/regression/train.conf
+++ b/examples/regression/train.conf
@@ -11,7 +11,7 @@ boosting_type = gbdt
 # alias: application, app
 objective = regression
-# eval metrics, support multi metric, delimite by ',' , support following metrics
+# eval metrics; multiple metrics are supported, delimited by ','. The following metrics are supported:
 # l1
 # l2 , default metric for regression
 # ndcg , default metric for lambdarank

--- a/examples/xendcg/train.conf
+++ b/examples/xendcg/train.conf
@@ -11,7 +11,7 @@ boosting_type = gbdt
 # alias: application, app
 objective = rank_xendcg
-# eval metrics, support multi metric, delimite by ',' , support following metrics
+# eval metrics; multiple metrics are supported, delimited by ','. The following metrics are supported:
 # l1
 # l2 , default metric for regression
 # ndcg , default metric for lambdarank

--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -381,7 +381,7 @@ def _list_to_1d_numpy(
        return np.asarray(data, dtype=dtype)  # SparseArray should be supported as well
    else:
        raise TypeError(
-            f"Wrong type({type(data).__name__}) for {name}.\n" "It should be list, numpy 1-D array or pandas Series"
+            f"Wrong type({type(data).__name__}) for {name}.\nIt should be list, numpy 1-D array or pandas Series"
        )
@@ -803,8 +803,7 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None:
    ]
    if bad_pandas_dtypes:
        raise ValueError(
-            'pandas dtypes must be int, float or bool.\n'
+            f"pandas dtypes must be int, float or bool.\nFields with bad pandas dtypes: {', '.join(bad_pandas_dtypes)}"
-            f'Fields with bad pandas dtypes: {", ".join(bad_pandas_dtypes)}'
        )
@@ -3298,7 +3297,7 @@ class Dataset:
                    self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices)))
                else:
                    _log_warning(
-                        f"Cannot subset {type(self.data).__name__} type of raw data.\n" "Returning original raw data"
+                        f"Cannot subset {type(self.data).__name__} type of raw data.\nReturning original raw data"
                    )
            self._need_slice = False
        if self.data is None:
@@ -3718,7 +3717,7 @@ class Booster:
            self.model_from_string(model_str)
        else:
            raise TypeError(
-                "Need at least one training dataset or model file or model string " "to create Booster instance"
+                "Need at least one training dataset or model file or model string to create Booster instance"
            )
        self.params = params
@@ -4052,7 +4051,7 @@ class Booster:
        if not isinstance(data, Dataset):
            raise TypeError(f"Validation data should be Dataset instance, met {type(data).__name__}")
        if data._predictor is not self.__init_predictor:
-            raise LightGBMError("Add validation data failed, " "you should use same predictor for these data")
+            raise LightGBMError("Add validation data failed, you should use same predictor for these data")
        _safe_call(
            _LIB.LGBM_BoosterAddValidData(
                self._handle,
@@ -4138,7 +4137,7 @@ class Booster:
            if not isinstance(train_set, Dataset):
                raise TypeError(f"Training data should be Dataset instance, met {type(train_set).__name__}")
            if train_set._predictor is not self.__init_predictor:
-                raise LightGBMError("Replace training data failed, " "you should use same predictor for these data")
+                raise LightGBMError("Replace training data failed, you should use same predictor for these data")
            self.train_set = train_set
            _safe_call(
                _LIB.LGBM_BoosterResetTrainingData(

--- a/python-package/lightgbm/callback.py
+++ b/python-package/lightgbm/callback.py
@@ -393,7 +393,7 @@ class _EarlyStoppingCallback:
            if self.verbose:
                best_score_str = "\t".join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]])
                _log_info(
-                    "Did not meet early stopping. " f"Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}"
+                    f"Did not meet early stopping. Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}"
                )
                if self.first_metric_only:
                    _log_info(f"Evaluated only: {metric_name}")

--- a/python-package/lightgbm/dask.py
+++ b/python-package/lightgbm/dask.py
@@ -1166,7 +1166,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
    _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs")  # type: ignore
    __init__.__doc__ = f"""
        {_before_kwargs}client : dask.distributed.Client or None, optional (default=None)
-        {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.
+        {" ":4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.
        {_kwargs}{_after_kwargs}
        """
@@ -1221,7 +1221,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
    _base_doc = _base_doc[: _base_doc.find("eval_group :")] + _base_doc[_base_doc.find("eval_metric :") :]
    # DaskLGBMClassifier support for callbacks and init_model is not tested
-    fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs
+    fit.__doc__ = f"""{_base_doc[: _base_doc.find("callbacks :")]}**kwargs
        Other parameters passed through to ``LGBMClassifier.fit()``.
    Returns
@@ -1369,7 +1369,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
    _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs")  # type: ignore
    __init__.__doc__ = f"""
        {_before_kwargs}client : dask.distributed.Client or None, optional (default=None)
-        {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.
+        {" ":4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.
        {_kwargs}{_after_kwargs}
        """
@@ -1424,7 +1424,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
    _base_doc = _base_doc[: _base_doc.find("eval_group :")] + _base_doc[_base_doc.find("eval_metric :") :]
    # DaskLGBMRegressor support for callbacks and init_model is not tested
-    fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs
+    fit.__doc__ = f"""{_base_doc[: _base_doc.find("callbacks :")]}**kwargs
        Other parameters passed through to ``LGBMRegressor.fit()``.
    Returns
@@ -1536,7 +1536,7 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
    _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs")  # type: ignore
    __init__.__doc__ = f"""
        {_before_kwargs}client : dask.distributed.Client or None, optional (default=None)
-        {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.
+        {" ":4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled.
        {_kwargs}{_after_kwargs}
        """
@@ -1596,11 +1596,11 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
        _base_doc[: _base_doc.find("feature_name :")]
        + "eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5))\n"
        + f"{' ':8}The evaluation positions of the specified metric.\n"
-        + f"{' ':4}{_base_doc[_base_doc.find('feature_name :'):]}"
+        + f"{' ':4}{_base_doc[_base_doc.find('feature_name :') :]}"
    )
    # DaskLGBMRanker support for callbacks and init_model is not tested
-    fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs
+    fit.__doc__ = f"""{_base_doc[: _base_doc.find("callbacks :")]}**kwargs
        Other parameters passed through to ``LGBMRanker.fit()``.
    Returns

--- a/python-package/lightgbm/plotting.py
+++ b/python-package/lightgbm/plotting.py
@@ -247,7 +247,7 @@ def plot_split_value_histogram(
    hist, split_bins = booster.get_split_value_histogram(feature=feature, bins=bins, xgboost_style=False)
    if np.count_nonzero(hist) == 0:
-        raise ValueError("Cannot plot split value histogram, " f"because feature {feature} was not used in splitting")
+        raise ValueError(f"Cannot plot split value histogram, because feature {feature} was not used in splitting")
    width = width_coef * (split_bins[1] - split_bins[0])
    centred = (split_bins[:-1] + split_bins[1:]) / 2

--- a/src/treelearner/parallel_tree_learner.h
+++ b/src/treelearner/parallel_tree_learner.h
@@ -47,7 +47,7 @@ class FeatureParallelTreeLearner: public TREELEARNER_T {
 /*!
 * \brief Data parallel learning algorithm.
 *        Workers use local data to construct histograms locally, then sync up global histograms.
-*        It is recommonded used when #data is large or #feature is small
+*        It is recommended for use when #data is large or #feature is small
 */
 template <typename TREELEARNER_T>
 class DataParallelTreeLearner: public TREELEARNER_T {

--- a/tests/python_package_test/test_arrow.py
+++ b/tests/python_package_test/test_arrow.py
@@ -20,9 +20,9 @@ if os.getenv("ALLOW_SKIP_ARROW_TESTS") == "1":
 else:
    import pyarrow as pa  # type: ignore
-    assert (
+    assert lgb.compat.PYARROW_INSTALLED is True, (
-        lgb.compat.PYARROW_INSTALLED is True
+        "'pyarrow' and its dependencies must be installed to run the arrow tests"
-    ), "'pyarrow' and its dependencies must be installed to run the arrow tests"
+    )
 # ----------------------------------------------------------------------------------------------- #
 #                                            UTILITIES                                            #

--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -2168,8 +2168,7 @@ def test_monotone_constraints(test_with_categorical_variable):
    trainset = generate_trainset_for_monotone_constraints_tests(test_with_categorical_variable)
    for test_with_interaction_constraints in [True, False]:
        error_msg = (
-            "Model not correctly constrained "
+            f"Model not correctly constrained (test_with_interaction_constraints={test_with_interaction_constraints})"
-            f"(test_with_interaction_constraints={test_with_interaction_constraints})"
        )
        for monotone_constraints_method in ["basic", "intermediate", "advanced"]:
            params = {

--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1425,9 +1425,9 @@ def test_getting_feature_names_in_np_input(estimator_class):
 def test_getting_feature_names_in_pd_input(estimator_class):
    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
    col_names = X.columns.to_list()
-    assert isinstance(col_names, list) and all(
+    assert isinstance(col_names, list) and all(isinstance(c, str) for c in col_names), (
-        isinstance(c, str) for c in col_names
+        "input data must have feature names for this test to cover the expected functionality"
-    ), "input data must have feature names for this test to cover the expected functionality"
+    )
    params = {"n_estimators": 2, "num_leaves": 7}
    if estimator_class is lgb.LGBMModel:
        model = estimator_class(**{**params, "objective": "binary"})

--- a/tests/python_package_test/utils.py
+++ b/tests/python_package_test/utils.py
@@ -251,12 +251,12 @@ def assert_subtree_valid(root):
    right_child = root["right_child"]
    (l_w, l_c) = assert_subtree_valid(left_child)
    (r_w, r_c) = assert_subtree_valid(right_child)
-    assert (
+    assert abs(root["internal_weight"] - (l_w + r_w)) <= 1e-3, (
-        abs(root["internal_weight"] - (l_w + r_w)) <= 1e-3
+        "root node's internal weight should be approximately the sum of its child nodes' internal weights"
-    ), "root node's internal weight should be approximately the sum of its child nodes' internal weights"
+    )
-    assert (
+    assert root["internal_count"] == l_c + r_c, (
-        root["internal_count"] == l_c + r_c
+        "root node's internal count should be exactly the sum of its child nodes' internal counts"
-    ), "root node's internal count should be exactly the sum of its child nodes' internal counts"
+    )
    return (root["internal_weight"], root["internal_count"])