Unverified Commit d47006f4 authored by James Lamb, committed by GitHub

[ci] [python-package] use ruff, enforce flake8-bugbear and flake8-comprehensions checks (#5871)

parent 452370ac
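Most of the Python changes below are mechanical fixes for the two newly enforced rule families: flake8-bugbear (B) and flake8-comprehensions (C4). The two patterns that recur most often in this diff are exception chaining (bugbear B904) and set comprehensions (C401). A minimal sketch with hypothetical function names, not LightGBM code:

    # Before: the patterns this diff removes.
    def parse_port_old(value):
        try:
            return int(value)
        except ValueError:
            # B904: re-raising without "from" loses the original cause
            raise TypeError(f"bad port: {value!r}")

    def unique_hosts_old(addresses):
        # C401: set() wrapped around a generator expression
        return set(a.split(":")[0] for a in addresses)

    # After: the forms ruff accepts.
    def parse_port_new(value):
        try:
            return int(value)
        except ValueError as err:
            raise TypeError(f"bad port: {value!r}") from err  # chains the cause

    def unique_hosts_new(addresses):
        return {a.split(":")[0] for a in addresses}  # set comprehension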
.ci/lint-python.sh
 #!/bin/sh

-echo "running flake8"
-flake8 \
-    --config=./python-package/setup.cfg \
-    . \
-|| exit -1
-echo "done running flake8"

-echo "running pydocstyle"
-pydocstyle \
+echo "running ruff"
+ruff check \
+    --config=./python-package/pyproject.toml \
     . \
 || exit -1
-echo "done running pydocstyle"
+echo "done running ruff"

 echo "running isort"
 isort \
@@ -71,11 +71,10 @@ if [[ $TASK == "lint" ]]; then
         ${CONDA_PYTHON_REQUIREMENT} \
         cmakelint \
         cpplint \
-        flake8 \
         isort \
         mypy \
-        pydocstyle \
-        "r-lintr>=3.0"
+        'r-lintr>=3.0' \
+        ruff
     source activate $CONDA_ENV
     echo "Linting Python code"
     sh ${BUILD_DIRECTORY}/.ci/lint-python.sh || exit -1
@@ -2,6 +2,7 @@
 """Wrapper for C API of LightGBM."""
 import abc
 import ctypes
+import inspect
 import json
 import warnings
 from collections import OrderedDict
@@ -996,8 +997,8 @@ class _InnerPredictor:
         elif isinstance(data, list):
             try:
                 data = np.array(data)
-            except BaseException:
-                raise ValueError('Cannot convert data list to numpy array.')
+            except BaseException as err:
+                raise ValueError('Cannot convert data list to numpy array.') from err
             preds, nrow = self.__pred_for_np2d(
                 mat=data,
                 start_iteration=start_iteration,
@@ -1015,8 +1016,8 @@ class _InnerPredictor:
             try:
                 _log_warning('Converting data to scipy sparse matrix.')
                 csr = scipy.sparse.csr_matrix(data)
-            except BaseException:
-                raise TypeError(f'Cannot predict data for type {type(data).__name__}')
+            except BaseException as err:
+                raise TypeError(f'Cannot predict data for type {type(data).__name__}') from err
             preds, nrow = self.__pred_for_csr(
                 csr=csr,
                 start_iteration=start_iteration,
@@ -1802,9 +1803,7 @@ class Dataset:
         # process for args
         params = {} if params is None else params
-        args_names = (getattr(self.__class__, '_lazy_init')
-                      .__code__
-                      .co_varnames[:getattr(self.__class__, '_lazy_init').__code__.co_argcount])
+        args_names = inspect.signature(self.__class__._lazy_init).parameters.keys()
         for key in params.keys():
             if key in args_names:
                 _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n'
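The inspect.signature() rewrite above is why "import inspect" was added at the top of the file. It is also more robust than slicing __code__.co_varnames: the code-object trick only sees positional parameters, while Signature.parameters includes keyword-only parameters as well. A small sketch with a made-up stand-in function:

    import inspect

    def _lazy_init_like(data, label=None, *, reference=None):  # hypothetical
        pass

    # Old approach: slice positional argument names out of the code object.
    code = _lazy_init_like.__code__
    print(code.co_varnames[:code.co_argcount])  # ('data', 'label') -- misses 'reference'

    # New approach: ask inspect for the full signature.
    print(list(inspect.signature(_lazy_init_like).parameters))
    # ['data', 'label', 'reference']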
@@ -1868,8 +1867,8 @@ class Dataset:
             try:
                 csr = scipy.sparse.csr_matrix(data)
                 self.__init_from_csr(csr, params_str, ref_dataset)
-            except BaseException:
-                raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}')
+            except BaseException as err:
+                raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}') from err
         if label is not None:
             self.set_label(label)
         if self.get_label() is None:
@@ -1920,7 +1919,7 @@ class Dataset:
         indices = self._create_sample_indices(total_nrow)

         # Select sampled rows, transpose to column order.
-        sampled = np.array([row for row in self._yield_row_from_seqlist(seqs, indices)])
+        sampled = np.array(list(self._yield_row_from_seqlist(seqs, indices)))
         sampled = sampled.T

         filtered = []
@@ -2777,7 +2776,7 @@ class Dataset:
         elif isinstance(self.data, Sequence):
             self.data = self.data[self.used_indices]
         elif isinstance(self.data, list) and len(self.data) > 0 and all(isinstance(x, Sequence) for x in self.data):
-            self.data = np.array([row for row in self._yield_row_from_seqlist(self.data, self.used_indices)])
+            self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices)))
         else:
             _log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n"
                          "Returning original raw data")
@@ -301,7 +301,7 @@ class _EarlyStoppingCallback:
         self._reset_storages()

-        n_metrics = len(set(m[1] for m in env.evaluation_result_list))
+        n_metrics = len({m[1] for m in env.evaluation_result_list})
         n_datasets = len(env.evaluation_result_list) // n_metrics
         if isinstance(self.min_delta, list):
             if not all(t >= 0 for t in self.min_delta):
@@ -787,7 +787,7 @@ def _train(
     else:
         if listen_port_in_params:
             _log_info("Using passed-in 'local_listen_port' for all workers")
-            unique_hosts = set(urlparse(a).hostname for a in worker_addresses)
+            unique_hosts = {urlparse(a).hostname for a in worker_addresses}
             if len(unique_hosts) < len(worker_addresses):
                 msg = (
                     "'local_listen_port' was provided in Dask training parameters, but at least one "
python-package/pyproject.toml
 [tool.isort]
 line_length = 120
 skip_glob = [
-    "external_libs/*",
-    "lightgbm-python/*"
+    "*/external_libs/*",
+    "*/lightgbm-python/*"
 ]

 [tool.mypy]
 exclude = 'build/*|compile/*|docs/*|examples/*|external_libs/*|lightgbm-python/*|tests/*'
 ignore_missing_imports = true

-[tool.pydocstyle]
-add_ignore = [
-    'D105'
-]
-convention = 'numpy'
-match = '(?!^test_|setup).*\.py'
-match_dir = '^(?!^external_libs|lightgbm-python|test|example).*'
+[tool.ruff]
+exclude = [
+    "build",
+    "compile",
+    "docs",
+    "external_libs",
+    "lightgbm-python",
+    "setup.py"
+]
+ignore = [
+    # (pydocstyle) Missing docstring in magic method
+    "D105",
+    # (pycodestyle) Line too long
+    "E501"
+]
+select = [
+    # flake8-bugbear
+    "B",
+    # flake8-comprehensions
+    "C4",
+    # pydocstyle
+    "D",
+    # pycodestyle
+    "E",
+    # pyflakes
+    "F"
+]
+# this should be set to the oldest version of python LightGBM supports
+target-version = "py37"
+
+[tool.ruff.per-file-ignores]
+"examples/*" = [
+    # pydocstyle
+    "D"
+]
+"tests/*" = [
+    # (flake8-bugbear) Found useless expression
+    "B018",
+    # pydocstyle
+    "D"
+]
+
+[tool.ruff.pydocstyle]
+convention = "numpy"
@@ -106,7 +106,7 @@ class DistributedMockup:
         for i, partition in enumerate(partitions):
             np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',')

-    def fit(self, partitions: List[np.ndarray], train_config: Dict = {}) -> None:
+    def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None:
         """Run the distributed training process on a single machine.

         For each worker i:
@@ -134,7 +134,7 @@ class DistributedMockup:
         if result.returncode != 0:
             raise RuntimeError('Error in training')

-    def predict(self, predict_config: Dict[str, Any] = {}) -> np.ndarray:
+    def predict(self, predict_config: Dict[str, Any]) -> np.ndarray:
         """Compute the predictions using the model created in the fit step.

         predict_config is used to predict the training set train.txt
@@ -178,7 +178,7 @@ def test_classifier(executable):
     }
     clf = DistributedMockup(executable)
     clf.fit(partitions, train_params)
-    y_probas = clf.predict()
+    y_probas = clf.predict(predict_config={})
     y_pred = y_probas > 0.5
     assert accuracy_score(clf.label_, y_pred) == 1.
@@ -194,5 +194,5 @@ def test_regressor(executable):
     }
     reg = DistributedMockup(executable)
     reg.fit(partitions, train_params)
-    y_pred = reg.predict()
+    y_pred = reg.predict(predict_config={})
     np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.)
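Dropping the "= {}" defaults from fit() and predict() fixes bugbear's B006 (mutable argument default): a mutable default is created once, at function definition time, and shared across all calls, so in-place mutations leak between calls. A minimal sketch of the trap, with hypothetical names:

    def predict_bad(config={}):  # B006: one dict is shared by every call
        config.setdefault("num_iteration", -1)
        return config

    assert predict_bad() is predict_bad()  # both calls mutated the same dict

    def predict_good(config=None):  # the usual fix when a default is still wanted
        config = {} if config is None else config
        return config

    assert predict_good() is not predict_good()

Here the test helpers simply made the argument required instead, which is why the callers above now pass predict_config={} explicitly.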
@@ -2,6 +2,7 @@
 import filecmp
 import numbers
 import re
+from copy import deepcopy
 from os import getenv
 from pathlib import Path
@@ -324,7 +325,7 @@ def test_add_features_same_booster_behaviour(tmp_path):
     d.set_label(y)
     b1 = lgb.Booster(train_set=d1)
     b = lgb.Booster(train_set=d)
-    for k in range(10):
+    for _ in range(10):
         b.update()
         b1.update()
     dname = tmp_path / "d.txt"
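Renaming k to _ in loops like the one above is bugbear's B007 (unused loop control variable): the underscore signals that the loop exists only for its side effects. Sketch:

    booster_updates = []
    for k in range(10):  # B007: "k" is never used in the body
        booster_updates.append("update")
    for _ in range(10):  # the underscore makes that intent explicit
        booster_updates.append("update")

The same rename is applied in the cegb tests further down.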
@@ -365,7 +366,7 @@ def test_add_features_from_different_sources():
     # test that method works for different data types
     d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
-    res_feature_names = [name for name in names]
+    res_feature_names = deepcopy(names)
     for idx, x_2 in enumerate(xxs, 2):
         original_type = type(d1.get_data())
         d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct()
@@ -407,7 +408,7 @@ def test_cegb_affects_behavior(tmp_path):
     ds = lgb.Dataset(X, feature_name=names).construct()
     ds.set_label(y)
     base = lgb.Booster(train_set=ds)
-    for k in range(10):
+    for _ in range(10):
         base.update()
     basename = tmp_path / "basename.txt"
     base.save_model(basename)
@@ -419,7 +420,7 @@ def test_cegb_affects_behavior(tmp_path):
              {'cegb_penalty_split': 1}]
     for case in cases:
         booster = lgb.Booster(train_set=ds, params=case)
-        for k in range(10):
+        for _ in range(10):
             booster.update()
         casename = tmp_path / "casename.txt"
         booster.save_model(casename)
@@ -445,7 +446,7 @@ def test_cegb_scaling_equalities(tmp_path):
     for (p1, p2) in pairs:
         booster1 = lgb.Booster(train_set=ds, params=p1)
         booster2 = lgb.Booster(train_set=ds, params=p2)
-        for k in range(10):
+        for _ in range(10):
             booster1.update()
             booster2.update()
         p1name = tmp_path / "p1.txt"
@@ -752,10 +753,10 @@ def test_feature_num_bin(min_data_in_bin):
     ]).T
     n_continuous = X.shape[1] - 1
     feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
-    ds_kwargs = dict(
-        params={'min_data_in_bin': min_data_in_bin},
-        categorical_feature=[n_continuous],  # last feature
-    )
+    ds_kwargs = {
+        "params": {'min_data_in_bin': min_data_in_bin},
+        "categorical_feature": [n_continuous],  # last feature
+    }
     ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
     expected_num_bins = [
         100 // min_data_in_bin + 1,  # extra bin for zero
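The dict(...) to {...} rewrite above (and the similar ones in the scikit-learn tests later in this diff) is flake8-comprehensions C408 (unnecessary dict call): a dict literal is more direct than the dict() constructor and avoids a function call. Sketch:

    # C408: dict() called with keyword arguments ...
    kwargs_old = dict(params={"min_data_in_bin": 2}, categorical_feature=[3])

    # ... rewritten as a literal.
    kwargs_new = {
        "params": {"min_data_in_bin": 2},
        "categorical_feature": [3],
    }
    assert kwargs_old == kwargs_new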
@@ -1062,9 +1062,9 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
             eval_class_weight.append({0: n_neg / n_pos, 1: n_pos / n_neg})
             init_score_value = np.log(np.mean(y_e) / (1 - np.mean(y_e)))
             if 'dataframe' in output:
-                d_init_score = dy_e.map_partitions(lambda x: pd.Series([init_score_value] * x.size))
+                d_init_score = dy_e.map_partitions(lambda x, val=init_score_value: pd.Series([val] * x.size))
             else:
-                d_init_score = dy_e.map_blocks(lambda x: np.repeat(init_score_value, x.size))
+                d_init_score = dy_e.map_blocks(lambda x, val=init_score_value: np.repeat(val, x.size))
             eval_init_score.append(d_init_score)
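Binding init_score_value through a lambda default argument above fixes bugbear's B023 (function defined in a loop uses a loop variable): Python closures capture variables late, so every lambda created in the loop would otherwise read whatever value the variable holds when the lambda finally runs, typically the last iteration's. A minimal sketch:

    # B023: every closure sees the final value of "factor".
    scalers_bad = [lambda x: x * factor for factor in (1, 2, 3)]
    assert [f(10) for f in scalers_bad] == [30, 30, 30]

    # Fix: freeze the current value via a default argument.
    scalers_good = [lambda x, factor=factor: x * factor for factor in (1, 2, 3)]
    assert [f(10) for f in scalers_good] == [10, 20, 30]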
@@ -886,13 +886,13 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better):
         min_delta = metric2min_delta[metric[0]]
     else:
         min_delta = [metric2min_delta[m] for m in metric]
-    train_kwargs = dict(
-        params=params,
-        train_set=train_ds,
-        num_boost_round=50,
-        valid_sets=[train_ds, valid_ds],
-        valid_names=['training', 'valid'],
-    )
+    train_kwargs = {
+        "params": params,
+        "train_set": train_ds,
+        "num_boost_round": 50,
+        "valid_sets": [train_ds, valid_ds],
+        "valid_names": ['training', 'valid'],
+    }

     # regular early stopping
     evals_result = {}
@@ -1771,7 +1771,7 @@ def test_monotone_constraints(test_with_categorical_variable):
         for tree in tree_str:
             # split_features are in 4th line.
             features = tree.splitlines()[3].split("=")[1].split(" ")
-            features = set(f"Column_{f}" for f in features)
+            features = {f"Column_{f}" for f in features}
             feature_sets.append(features)
         return np.array(feature_sets)
@@ -2860,14 +2860,14 @@ def test_early_stopping_for_only_first_metric():
     iter_valid1_l2 = 3
     iter_valid2_l1 = 3
     iter_valid2_l2 = 15
-    assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2
+    assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2
     iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
     iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
     iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])

     iter_cv_l1 = 15
     iter_cv_l2 = 13
-    assert len(set([iter_cv_l1, iter_cv_l2])) == 2
+    assert len({iter_cv_l1, iter_cv_l2}) == 2
     iter_cv_min = min([iter_cv_l1, iter_cv_l2])

     # test for lgb.train
@@ -313,20 +313,24 @@ def test_grid_search():
     y = y.astype(str)  # utilize label encoder at it's max power
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
-    params = dict(subsample=0.8,
-                  subsample_freq=1)
-    grid_params = dict(boosting_type=['rf', 'gbdt'],
-                       n_estimators=[4, 6],
-                       reg_alpha=[0.01, 0.005])
+    params = {
+        "subsample": 0.8,
+        "subsample_freq": 1
+    }
+    grid_params = {
+        "boosting_type": ['rf', 'gbdt'],
+        "n_estimators": [4, 6],
+        "reg_alpha": [0.01, 0.005]
+    }
     evals_result = {}
-    fit_params = dict(
-        eval_set=[(X_val, y_val)],
-        eval_metric=constant_metric,
-        callbacks=[
+    fit_params = {
+        "eval_set": [(X_val, y_val)],
+        "eval_metric": constant_metric,
+        "callbacks": [
             lgb.early_stopping(2),
             lgb.record_evaluation(evals_result)
         ]
-    )
+    }
     grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2)
     grid.fit(X_train, y_train, **fit_params)
     score = grid.score(X_test, y_test)  # utilizes GridSearchCV default refit=True
@@ -350,14 +354,20 @@ def test_random_search():
     X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1,
                                                       random_state=42)
     n_iter = 3  # Number of samples
-    params = dict(subsample=0.8,
-                  subsample_freq=1)
-    param_dist = dict(boosting_type=['rf', 'gbdt'],
-                      n_estimators=[np.random.randint(low=3, high=10) for i in range(n_iter)],
-                      reg_alpha=[np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)])
-    fit_params = dict(eval_set=[(X_val, y_val)],
-                      eval_metric=constant_metric,
-                      callbacks=[lgb.early_stopping(2)])
+    params = {
+        "subsample": 0.8,
+        "subsample_freq": 1
+    }
+    param_dist = {
+        "boosting_type": ['rf', 'gbdt'],
+        "n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)],
+        "reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)]
+    }
+    fit_params = {
+        "eval_set": [(X_val, y_val)],
+        "eval_metric": constant_metric,
+        "callbacks": [lgb.early_stopping(2)]
+    }
     rand = RandomizedSearchCV(estimator=lgb.LGBMClassifier(**params),
                               param_distributions=param_dist, cv=2,
                               n_iter=n_iter, random_state=42)
@@ -1139,7 +1149,7 @@ def test_first_metric_only():
     iter_valid1_l2 = 4
     iter_valid2_l1 = 2
     iter_valid2_l2 = 2
-    assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2
+    assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2
     iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
     iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
     iter_min = min([iter_min_l1, iter_min_l2])