Unverified commit d47006f4, authored by James Lamb, committed by GitHub
Browse files

[ci] [python-package] use ruff, enforce flake8-bugbear and flake8-comprehensions checks (#5871)

parent 452370ac
#!/bin/sh #!/bin/sh
echo "running flake8" echo "running ruff"
flake8 \ ruff check \
--config=./python-package/setup.cfg \
. \
|| exit -1
echo "done running flake8"
echo "running pydocstyle"
pydocstyle \
--config=./python-package/pyproject.toml \ --config=./python-package/pyproject.toml \
. \ . \
|| exit -1 || exit -1
echo "done running pydocstyle" echo "done running ruff"
echo "running isort" echo "running isort"
isort \ isort \
......
...@@ -71,11 +71,10 @@ if [[ $TASK == "lint" ]]; then ...@@ -71,11 +71,10 @@ if [[ $TASK == "lint" ]]; then
${CONDA_PYTHON_REQUIREMENT} \ ${CONDA_PYTHON_REQUIREMENT} \
cmakelint \ cmakelint \
cpplint \ cpplint \
flake8 \
isort \ isort \
mypy \ mypy \
pydocstyle \ 'r-lintr>=3.0' \
"r-lintr>=3.0" ruff
source activate $CONDA_ENV source activate $CONDA_ENV
echo "Linting Python code" echo "Linting Python code"
sh ${BUILD_DIRECTORY}/.ci/lint-python.sh || exit -1 sh ${BUILD_DIRECTORY}/.ci/lint-python.sh || exit -1
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
"""Wrapper for C API of LightGBM.""" """Wrapper for C API of LightGBM."""
import abc import abc
import ctypes import ctypes
import inspect
import json import json
import warnings import warnings
from collections import OrderedDict from collections import OrderedDict
...@@ -996,8 +997,8 @@ class _InnerPredictor: ...@@ -996,8 +997,8 @@ class _InnerPredictor:
elif isinstance(data, list): elif isinstance(data, list):
try: try:
data = np.array(data) data = np.array(data)
except BaseException: except BaseException as err:
raise ValueError('Cannot convert data list to numpy array.') raise ValueError('Cannot convert data list to numpy array.') from err
preds, nrow = self.__pred_for_np2d( preds, nrow = self.__pred_for_np2d(
mat=data, mat=data,
start_iteration=start_iteration, start_iteration=start_iteration,
...@@ -1015,8 +1016,8 @@ class _InnerPredictor: ...@@ -1015,8 +1016,8 @@ class _InnerPredictor:
try: try:
_log_warning('Converting data to scipy sparse matrix.') _log_warning('Converting data to scipy sparse matrix.')
csr = scipy.sparse.csr_matrix(data) csr = scipy.sparse.csr_matrix(data)
except BaseException: except BaseException as err:
raise TypeError(f'Cannot predict data for type {type(data).__name__}') raise TypeError(f'Cannot predict data for type {type(data).__name__}') from err
preds, nrow = self.__pred_for_csr( preds, nrow = self.__pred_for_csr(
csr=csr, csr=csr,
start_iteration=start_iteration, start_iteration=start_iteration,
...@@ -1802,9 +1803,7 @@ class Dataset: ...@@ -1802,9 +1803,7 @@ class Dataset:
# process for args # process for args
params = {} if params is None else params params = {} if params is None else params
args_names = (getattr(self.__class__, '_lazy_init') args_names = inspect.signature(self.__class__._lazy_init).parameters.keys()
.__code__
.co_varnames[:getattr(self.__class__, '_lazy_init').__code__.co_argcount])
for key in params.keys(): for key in params.keys():
if key in args_names: if key in args_names:
_log_warning(f'{key} keyword has been found in `params` and will be ignored.\n' _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n'
...@@ -1868,8 +1867,8 @@ class Dataset: ...@@ -1868,8 +1867,8 @@ class Dataset:
try: try:
csr = scipy.sparse.csr_matrix(data) csr = scipy.sparse.csr_matrix(data)
self.__init_from_csr(csr, params_str, ref_dataset) self.__init_from_csr(csr, params_str, ref_dataset)
except BaseException: except BaseException as err:
raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}') raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}') from err
if label is not None: if label is not None:
self.set_label(label) self.set_label(label)
if self.get_label() is None: if self.get_label() is None:
...@@ -1920,7 +1919,7 @@ class Dataset: ...@@ -1920,7 +1919,7 @@ class Dataset:
indices = self._create_sample_indices(total_nrow) indices = self._create_sample_indices(total_nrow)
# Select sampled rows, transpose to column order. # Select sampled rows, transpose to column order.
sampled = np.array([row for row in self._yield_row_from_seqlist(seqs, indices)]) sampled = np.array(list(self._yield_row_from_seqlist(seqs, indices)))
sampled = sampled.T sampled = sampled.T
filtered = [] filtered = []
...@@ -2777,7 +2776,7 @@ class Dataset: ...@@ -2777,7 +2776,7 @@ class Dataset:
elif isinstance(self.data, Sequence): elif isinstance(self.data, Sequence):
self.data = self.data[self.used_indices] self.data = self.data[self.used_indices]
elif isinstance(self.data, list) and len(self.data) > 0 and all(isinstance(x, Sequence) for x in self.data): elif isinstance(self.data, list) and len(self.data) > 0 and all(isinstance(x, Sequence) for x in self.data):
self.data = np.array([row for row in self._yield_row_from_seqlist(self.data, self.used_indices)]) self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices)))
else: else:
_log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n" _log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n"
"Returning original raw data") "Returning original raw data")
......
...@@ -301,7 +301,7 @@ class _EarlyStoppingCallback: ...@@ -301,7 +301,7 @@ class _EarlyStoppingCallback:
self._reset_storages() self._reset_storages()
n_metrics = len(set(m[1] for m in env.evaluation_result_list)) n_metrics = len({m[1] for m in env.evaluation_result_list})
n_datasets = len(env.evaluation_result_list) // n_metrics n_datasets = len(env.evaluation_result_list) // n_metrics
if isinstance(self.min_delta, list): if isinstance(self.min_delta, list):
if not all(t >= 0 for t in self.min_delta): if not all(t >= 0 for t in self.min_delta):
......
...@@ -787,7 +787,7 @@ def _train( ...@@ -787,7 +787,7 @@ def _train(
else: else:
if listen_port_in_params: if listen_port_in_params:
_log_info("Using passed-in 'local_listen_port' for all workers") _log_info("Using passed-in 'local_listen_port' for all workers")
unique_hosts = set(urlparse(a).hostname for a in worker_addresses) unique_hosts = {urlparse(a).hostname for a in worker_addresses}
if len(unique_hosts) < len(worker_addresses): if len(unique_hosts) < len(worker_addresses):
msg = ( msg = (
"'local_listen_port' was provided in Dask training parameters, but at least one " "'local_listen_port' was provided in Dask training parameters, but at least one "
......
[tool.isort] [tool.isort]
line_length = 120 line_length = 120
skip_glob = [ skip_glob = [
"external_libs/*", "*/external_libs/*",
"lightgbm-python/*" "*/lightgbm-python/*"
] ]
[tool.mypy] [tool.mypy]
exclude = 'build/*|compile/*|docs/*|examples/*|external_libs/*|lightgbm-python/*|tests/*' exclude = 'build/*|compile/*|docs/*|examples/*|external_libs/*|lightgbm-python/*|tests/*'
ignore_missing_imports = true ignore_missing_imports = true
[tool.pydocstyle] [tool.ruff]
add_ignore = [ exclude = [
'D105' "build",
"compile",
"docs",
"external_libs",
"lightgbm-python",
"setup.py"
] ]
convention = 'numpy' ignore = [
match = '(?!^test_|setup).*\.py' # (pydocstyle) Missing docstring in magic method
match_dir = '^(?!^external_libs|lightgbm-python|test|example).*' "D105",
# (pycodestyle) Line too long
"E501"
]
select = [
# flake8-bugbear
"B",
# flake8-comprehensions
"C4",
# pydocstyle
"D",
# pycodestyle
"E",
# pyflakes
"F"
]
# this should be set to the oldest version of python LightGBM supports
target-version = "py37"
[tool.ruff.per-file-ignores]
"examples/*" = [
# pydocstyle
"D"
]
"tests/*" = [
# (flake8-bugbear) Found useless expression
"B018",
# pydocstyle
"D"
]
[tool.ruff.pydocstyle]
convention = "numpy"
...@@ -106,7 +106,7 @@ class DistributedMockup: ...@@ -106,7 +106,7 @@ class DistributedMockup:
for i, partition in enumerate(partitions): for i, partition in enumerate(partitions):
np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',') np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',')
def fit(self, partitions: List[np.ndarray], train_config: Dict = {}) -> None: def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None:
"""Run the distributed training process on a single machine. """Run the distributed training process on a single machine.
For each worker i: For each worker i:
...@@ -134,7 +134,7 @@ class DistributedMockup: ...@@ -134,7 +134,7 @@ class DistributedMockup:
if result.returncode != 0: if result.returncode != 0:
raise RuntimeError('Error in training') raise RuntimeError('Error in training')
def predict(self, predict_config: Dict[str, Any] = {}) -> np.ndarray: def predict(self, predict_config: Dict[str, Any]) -> np.ndarray:
"""Compute the predictions using the model created in the fit step. """Compute the predictions using the model created in the fit step.
predict_config is used to predict the training set train.txt predict_config is used to predict the training set train.txt
...@@ -178,7 +178,7 @@ def test_classifier(executable): ...@@ -178,7 +178,7 @@ def test_classifier(executable):
} }
clf = DistributedMockup(executable) clf = DistributedMockup(executable)
clf.fit(partitions, train_params) clf.fit(partitions, train_params)
y_probas = clf.predict() y_probas = clf.predict(predict_config={})
y_pred = y_probas > 0.5 y_pred = y_probas > 0.5
assert accuracy_score(clf.label_, y_pred) == 1. assert accuracy_score(clf.label_, y_pred) == 1.
...@@ -194,5 +194,5 @@ def test_regressor(executable): ...@@ -194,5 +194,5 @@ def test_regressor(executable):
} }
reg = DistributedMockup(executable) reg = DistributedMockup(executable)
reg.fit(partitions, train_params) reg.fit(partitions, train_params)
y_pred = reg.predict() y_pred = reg.predict(predict_config={})
np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.) np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.)
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
import filecmp import filecmp
import numbers import numbers
import re import re
from copy import deepcopy
from os import getenv from os import getenv
from pathlib import Path from pathlib import Path
...@@ -324,7 +325,7 @@ def test_add_features_same_booster_behaviour(tmp_path): ...@@ -324,7 +325,7 @@ def test_add_features_same_booster_behaviour(tmp_path):
d.set_label(y) d.set_label(y)
b1 = lgb.Booster(train_set=d1) b1 = lgb.Booster(train_set=d1)
b = lgb.Booster(train_set=d) b = lgb.Booster(train_set=d)
for k in range(10): for _ in range(10):
b.update() b.update()
b1.update() b1.update()
dname = tmp_path / "d.txt" dname = tmp_path / "d.txt"
...@@ -365,7 +366,7 @@ def test_add_features_from_different_sources(): ...@@ -365,7 +366,7 @@ def test_add_features_from_different_sources():
# test that method works for different data types # test that method works for different data types
d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct() d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
res_feature_names = [name for name in names] res_feature_names = deepcopy(names)
for idx, x_2 in enumerate(xxs, 2): for idx, x_2 in enumerate(xxs, 2):
original_type = type(d1.get_data()) original_type = type(d1.get_data())
d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct() d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct()
...@@ -407,7 +408,7 @@ def test_cegb_affects_behavior(tmp_path): ...@@ -407,7 +408,7 @@ def test_cegb_affects_behavior(tmp_path):
ds = lgb.Dataset(X, feature_name=names).construct() ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y) ds.set_label(y)
base = lgb.Booster(train_set=ds) base = lgb.Booster(train_set=ds)
for k in range(10): for _ in range(10):
base.update() base.update()
basename = tmp_path / "basename.txt" basename = tmp_path / "basename.txt"
base.save_model(basename) base.save_model(basename)
...@@ -419,7 +420,7 @@ def test_cegb_affects_behavior(tmp_path): ...@@ -419,7 +420,7 @@ def test_cegb_affects_behavior(tmp_path):
{'cegb_penalty_split': 1}] {'cegb_penalty_split': 1}]
for case in cases: for case in cases:
booster = lgb.Booster(train_set=ds, params=case) booster = lgb.Booster(train_set=ds, params=case)
for k in range(10): for _ in range(10):
booster.update() booster.update()
casename = tmp_path / "casename.txt" casename = tmp_path / "casename.txt"
booster.save_model(casename) booster.save_model(casename)
...@@ -445,7 +446,7 @@ def test_cegb_scaling_equalities(tmp_path): ...@@ -445,7 +446,7 @@ def test_cegb_scaling_equalities(tmp_path):
for (p1, p2) in pairs: for (p1, p2) in pairs:
booster1 = lgb.Booster(train_set=ds, params=p1) booster1 = lgb.Booster(train_set=ds, params=p1)
booster2 = lgb.Booster(train_set=ds, params=p2) booster2 = lgb.Booster(train_set=ds, params=p2)
for k in range(10): for _ in range(10):
booster1.update() booster1.update()
booster2.update() booster2.update()
p1name = tmp_path / "p1.txt" p1name = tmp_path / "p1.txt"
...@@ -752,10 +753,10 @@ def test_feature_num_bin(min_data_in_bin): ...@@ -752,10 +753,10 @@ def test_feature_num_bin(min_data_in_bin):
]).T ]).T
n_continuous = X.shape[1] - 1 n_continuous = X.shape[1] - 1
feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1'] feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
ds_kwargs = dict( ds_kwargs = {
params={'min_data_in_bin': min_data_in_bin}, "params": {'min_data_in_bin': min_data_in_bin},
categorical_feature=[n_continuous], # last feature "categorical_feature": [n_continuous], # last feature
) }
ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct() ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
expected_num_bins = [ expected_num_bins = [
100 // min_data_in_bin + 1, # extra bin for zero 100 // min_data_in_bin + 1, # extra bin for zero
......
...@@ -1062,9 +1062,9 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix, ...@@ -1062,9 +1062,9 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
eval_class_weight.append({0: n_neg / n_pos, 1: n_pos / n_neg}) eval_class_weight.append({0: n_neg / n_pos, 1: n_pos / n_neg})
init_score_value = np.log(np.mean(y_e) / (1 - np.mean(y_e))) init_score_value = np.log(np.mean(y_e) / (1 - np.mean(y_e)))
if 'dataframe' in output: if 'dataframe' in output:
d_init_score = dy_e.map_partitions(lambda x: pd.Series([init_score_value] * x.size)) d_init_score = dy_e.map_partitions(lambda x, val=init_score_value: pd.Series([val] * x.size))
else: else:
d_init_score = dy_e.map_blocks(lambda x: np.repeat(init_score_value, x.size)) d_init_score = dy_e.map_blocks(lambda x, val=init_score_value: np.repeat(val, x.size))
eval_init_score.append(d_init_score) eval_init_score.append(d_init_score)
......
...@@ -886,13 +886,13 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better): ...@@ -886,13 +886,13 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better):
min_delta = metric2min_delta[metric[0]] min_delta = metric2min_delta[metric[0]]
else: else:
min_delta = [metric2min_delta[m] for m in metric] min_delta = [metric2min_delta[m] for m in metric]
train_kwargs = dict( train_kwargs = {
params=params, "params": params,
train_set=train_ds, "train_set": train_ds,
num_boost_round=50, "num_boost_round": 50,
valid_sets=[train_ds, valid_ds], "valid_sets": [train_ds, valid_ds],
valid_names=['training', 'valid'], "valid_names": ['training', 'valid'],
) }
# regular early stopping # regular early stopping
evals_result = {} evals_result = {}
...@@ -1771,7 +1771,7 @@ def test_monotone_constraints(test_with_categorical_variable): ...@@ -1771,7 +1771,7 @@ def test_monotone_constraints(test_with_categorical_variable):
for tree in tree_str: for tree in tree_str:
# split_features are in 4th line. # split_features are in 4th line.
features = tree.splitlines()[3].split("=")[1].split(" ") features = tree.splitlines()[3].split("=")[1].split(" ")
features = set(f"Column_{f}" for f in features) features = {f"Column_{f}" for f in features}
feature_sets.append(features) feature_sets.append(features)
return np.array(feature_sets) return np.array(feature_sets)
...@@ -2860,14 +2860,14 @@ def test_early_stopping_for_only_first_metric(): ...@@ -2860,14 +2860,14 @@ def test_early_stopping_for_only_first_metric():
iter_valid1_l2 = 3 iter_valid1_l2 = 3
iter_valid2_l1 = 3 iter_valid2_l1 = 3
iter_valid2_l2 = 15 iter_valid2_l2 = 15
assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2 assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2
iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1]) iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2]) iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2]) iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])
iter_cv_l1 = 15 iter_cv_l1 = 15
iter_cv_l2 = 13 iter_cv_l2 = 13
assert len(set([iter_cv_l1, iter_cv_l2])) == 2 assert len({iter_cv_l1, iter_cv_l2}) == 2
iter_cv_min = min([iter_cv_l1, iter_cv_l2]) iter_cv_min = min([iter_cv_l1, iter_cv_l2])
# test for lgb.train # test for lgb.train
......
...@@ -313,20 +313,24 @@ def test_grid_search(): ...@@ -313,20 +313,24 @@ def test_grid_search():
y = y.astype(str) # utilize label encoder at it's max power y = y.astype(str) # utilize label encoder at it's max power
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42) X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
params = dict(subsample=0.8, params = {
subsample_freq=1) "subsample": 0.8,
grid_params = dict(boosting_type=['rf', 'gbdt'], "subsample_freq": 1
n_estimators=[4, 6], }
reg_alpha=[0.01, 0.005]) grid_params = {
"boosting_type": ['rf', 'gbdt'],
"n_estimators": [4, 6],
"reg_alpha": [0.01, 0.005]
}
evals_result = {} evals_result = {}
fit_params = dict( fit_params = {
eval_set=[(X_val, y_val)], "eval_set": [(X_val, y_val)],
eval_metric=constant_metric, "eval_metric": constant_metric,
callbacks=[ "callbacks": [
lgb.early_stopping(2), lgb.early_stopping(2),
lgb.record_evaluation(evals_result) lgb.record_evaluation(evals_result)
] ]
) }
grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2) grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2)
grid.fit(X_train, y_train, **fit_params) grid.fit(X_train, y_train, **fit_params)
score = grid.score(X_test, y_test) # utilizes GridSearchCV default refit=True score = grid.score(X_test, y_test) # utilizes GridSearchCV default refit=True
...@@ -350,14 +354,20 @@ def test_random_search(): ...@@ -350,14 +354,20 @@ def test_random_search():
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1,
random_state=42) random_state=42)
n_iter = 3 # Number of samples n_iter = 3 # Number of samples
params = dict(subsample=0.8, params = {
subsample_freq=1) "subsample": 0.8,
param_dist = dict(boosting_type=['rf', 'gbdt'], "subsample_freq": 1
n_estimators=[np.random.randint(low=3, high=10) for i in range(n_iter)], }
reg_alpha=[np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)]) param_dist = {
fit_params = dict(eval_set=[(X_val, y_val)], "boosting_type": ['rf', 'gbdt'],
eval_metric=constant_metric, "n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)],
callbacks=[lgb.early_stopping(2)]) "reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)]
}
fit_params = {
"eval_set": [(X_val, y_val)],
"eval_metric": constant_metric,
"callbacks": [lgb.early_stopping(2)]
}
rand = RandomizedSearchCV(estimator=lgb.LGBMClassifier(**params), rand = RandomizedSearchCV(estimator=lgb.LGBMClassifier(**params),
param_distributions=param_dist, cv=2, param_distributions=param_dist, cv=2,
n_iter=n_iter, random_state=42) n_iter=n_iter, random_state=42)
...@@ -1139,7 +1149,7 @@ def test_first_metric_only(): ...@@ -1139,7 +1149,7 @@ def test_first_metric_only():
iter_valid1_l2 = 4 iter_valid1_l2 = 4
iter_valid2_l1 = 2 iter_valid2_l1 = 2
iter_valid2_l2 = 2 iter_valid2_l2 = 2
assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2 assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2
iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1]) iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2]) iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
iter_min = min([iter_min_l1, iter_min_l2]) iter_min = min([iter_min_l1, iter_min_l2])
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment