Unverified Commit d47006f4 authored by James Lamb, committed by GitHub

[ci] [python-package] use ruff, enforce flake8-bugbear and flake8-comprehensions checks (#5871)

parent 452370ac
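Most of the Python changes below are mechanical fixes for the two newly enforced rule families: flake8-bugbear (B) and flake8-comprehensions (C4). The two patterns that recur most often in this diff are exception chaining (bugbear B904) and set comprehensions (C401). A minimal sketch with hypothetical function names, not LightGBM code:

    # Before: the patterns this diff removes.
    def parse_port_old(value):
        try:
            return int(value)
        except ValueError:
            # B904: re-raising without "from" loses the original cause
            raise TypeError(f"bad port: {value!r}")

    def unique_hosts_old(addresses):
        # C401: set() wrapped around a generator expression
        return set(a.split(":")[0] for a in addresses)

    # After: the forms ruff accepts.
    def parse_port_new(value):
        try:
            return int(value)
        except ValueError as err:
            raise TypeError(f"bad port: {value!r}") from err  # chains the cause

    def unique_hosts_new(addresses):
        return {a.split(":")[0] for a in addresses}  # set comprehension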
.ci/lint-python.sh
 #!/bin/sh

-echo "running flake8"
-flake8 \
-    --config=./python-package/setup.cfg \
-    . \
-|| exit -1
-echo "done running flake8"

-echo "running pydocstyle"
-pydocstyle \
+echo "running ruff"
+ruff check \
+    --config=./python-package/pyproject.toml \
     . \
 || exit -1
-echo "done running pydocstyle"
+echo "done running ruff"

 echo "running isort"
 isort \
@@ -71,11 +71,10 @@ if [[ $TASK == "lint" ]]; then
         ${CONDA_PYTHON_REQUIREMENT} \
         cmakelint \
         cpplint \
-        flake8 \
         isort \
         mypy \
-        pydocstyle \
-        "r-lintr>=3.0"
+        'r-lintr>=3.0' \
+        ruff
     source activate $CONDA_ENV
     echo "Linting Python code"
     sh ${BUILD_DIRECTORY}/.ci/lint-python.sh || exit -1
@@ -2,6 +2,7 @@
 """Wrapper for C API of LightGBM."""
 import abc
 import ctypes
+import inspect
 import json
 import warnings
 from collections import OrderedDict
@@ -996,8 +997,8 @@ class _InnerPredictor:
         elif isinstance(data, list):
             try:
                 data = np.array(data)
-            except BaseException:
-                raise ValueError('Cannot convert data list to numpy array.')
+            except BaseException as err:
+                raise ValueError('Cannot convert data list to numpy array.') from err
             preds, nrow = self.__pred_for_np2d(
                 mat=data,
                 start_iteration=start_iteration,
@@ -1015,8 +1016,8 @@ class _InnerPredictor:
             try:
                 _log_warning('Converting data to scipy sparse matrix.')
                 csr = scipy.sparse.csr_matrix(data)
-            except BaseException:
-                raise TypeError(f'Cannot predict data for type {type(data).__name__}')
+            except BaseException as err:
+                raise TypeError(f'Cannot predict data for type {type(data).__name__}') from err
             preds, nrow = self.__pred_for_csr(
                 csr=csr,
                 start_iteration=start_iteration,
@@ -1802,9 +1803,7 @@ class Dataset:
         # process for args
         params = {} if params is None else params
-        args_names = (getattr(self.__class__, '_lazy_init')
-                      .__code__
-                      .co_varnames[:getattr(self.__class__, '_lazy_init').__code__.co_argcount])
+        args_names = inspect.signature(self.__class__._lazy_init).parameters.keys()
         for key in params.keys():
             if key in args_names:
                 _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n'
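The inspect.signature() rewrite above is why "import inspect" was added at the top of the file. It is also more robust than slicing __code__.co_varnames: the code-object trick only sees positional parameters, while Signature.parameters includes keyword-only parameters as well. A small sketch with a made-up stand-in function:

    import inspect

    def _lazy_init_like(data, label=None, *, reference=None):  # hypothetical
        pass

    # Old approach: slice positional argument names out of the code object.
    code = _lazy_init_like.__code__
    print(code.co_varnames[:code.co_argcount])  # ('data', 'label') -- misses 'reference'

    # New approach: ask inspect for the full signature.
    print(list(inspect.signature(_lazy_init_like).parameters))
    # ['data', 'label', 'reference']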
@@ -1868,8 +1867,8 @@ class Dataset:
             try:
                 csr = scipy.sparse.csr_matrix(data)
                 self.__init_from_csr(csr, params_str, ref_dataset)
-            except BaseException:
-                raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}')
+            except BaseException as err:
+                raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}') from err
         if label is not None:
             self.set_label(label)
         if self.get_label() is None:
@@ -1920,7 +1919,7 @@ class Dataset:
         indices = self._create_sample_indices(total_nrow)

         # Select sampled rows, transpose to column order.
-        sampled = np.array([row for row in self._yield_row_from_seqlist(seqs, indices)])
+        sampled = np.array(list(self._yield_row_from_seqlist(seqs, indices)))
         sampled = sampled.T

         filtered = []
@@ -2777,7 +2776,7 @@ class Dataset:
         elif isinstance(self.data, Sequence):
             self.data = self.data[self.used_indices]
         elif isinstance(self.data, list) and len(self.data) > 0 and all(isinstance(x, Sequence) for x in self.data):
-            self.data = np.array([row for row in self._yield_row_from_seqlist(self.data, self.used_indices)])
+            self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices)))
         else:
             _log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n"
                          "Returning original raw data")
@@ -301,7 +301,7 @@ class _EarlyStoppingCallback:
         self._reset_storages()

-        n_metrics = len(set(m[1] for m in env.evaluation_result_list))
+        n_metrics = len({m[1] for m in env.evaluation_result_list})
         n_datasets = len(env.evaluation_result_list) // n_metrics
         if isinstance(self.min_delta, list):
             if not all(t >= 0 for t in self.min_delta):
@@ -787,7 +787,7 @@ def _train(
     else:
         if listen_port_in_params:
             _log_info("Using passed-in 'local_listen_port' for all workers")
-            unique_hosts = set(urlparse(a).hostname for a in worker_addresses)
+            unique_hosts = {urlparse(a).hostname for a in worker_addresses}
             if len(unique_hosts) < len(worker_addresses):
                 msg = (
                     "'local_listen_port' was provided in Dask training parameters, but at least one "
python-package/pyproject.toml
 [tool.isort]
 line_length = 120
 skip_glob = [
-    "external_libs/*",
-    "lightgbm-python/*"
+    "*/external_libs/*",
+    "*/lightgbm-python/*"
 ]

 [tool.mypy]
 exclude = 'build/*|compile/*|docs/*|examples/*|external_libs/*|lightgbm-python/*|tests/*'
 ignore_missing_imports = true

-[tool.pydocstyle]
-add_ignore = [
-    'D105'
-]
-convention = 'numpy'
-match = '(?!^test_|setup).*\.py'
-match_dir = '^(?!^external_libs|lightgbm-python|test|example).*'
+[tool.ruff]
+exclude = [
+    "build",
+    "compile",
+    "docs",
+    "external_libs",
+    "lightgbm-python",
+    "setup.py"
+]
+ignore = [
+    # (pydocstyle) Missing docstring in magic method
+    "D105",
+    # (pycodestyle) Line too long
+    "E501"
+]
+select = [
+    # flake8-bugbear
+    "B",
+    # flake8-comprehensions
+    "C4",
+    # pydocstyle
+    "D",
+    # pycodestyle
+    "E",
+    # pyflakes
+    "F"
+]
+# this should be set to the oldest version of python LightGBM supports
+target-version = "py37"
+
+[tool.ruff.per-file-ignores]
+"examples/*" = [
+    # pydocstyle
+    "D"
+]
+"tests/*" = [
+    # (flake8-bugbear) Found useless expression
+    "B018",
+    # pydocstyle
+    "D"
+]
+
+[tool.ruff.pydocstyle]
+convention = "numpy"
@@ -106,7 +106,7 @@ class DistributedMockup:
         for i, partition in enumerate(partitions):
             np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',')

-    def fit(self, partitions: List[np.ndarray], train_config: Dict = {}) -> None:
+    def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None:
         """Run the distributed training process on a single machine.

         For each worker i:
@@ -134,7 +134,7 @@ class DistributedMockup:
         if result.returncode != 0:
             raise RuntimeError('Error in training')

-    def predict(self, predict_config: Dict[str, Any] = {}) -> np.ndarray:
+    def predict(self, predict_config: Dict[str, Any]) -> np.ndarray:
         """Compute the predictions using the model created in the fit step.

         predict_config is used to predict the training set train.txt
@@ -178,7 +178,7 @@ def test_classifier(executable):
     }
     clf = DistributedMockup(executable)
     clf.fit(partitions, train_params)
-    y_probas = clf.predict()
+    y_probas = clf.predict(predict_config={})
     y_pred = y_probas > 0.5
     assert accuracy_score(clf.label_, y_pred) == 1.
@@ -194,5 +194,5 @@ def test_regressor(executable):
     }
     reg = DistributedMockup(executable)
     reg.fit(partitions, train_params)
-    y_pred = reg.predict()
+    y_pred = reg.predict(predict_config={})
     np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.)
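Dropping the "= {}" defaults from fit() and predict() fixes bugbear's B006 (mutable argument default): a mutable default is created once, at function definition time, and shared across all calls, so in-place mutations leak between calls. A minimal sketch of the trap, with hypothetical names:

    def predict_bad(config={}):  # B006: one dict is shared by every call
        config.setdefault("num_iteration", -1)
        return config

    assert predict_bad() is predict_bad()  # both calls mutated the same dict

    def predict_good(config=None):  # the usual fix when a default is still wanted
        config = {} if config is None else config
        return config

    assert predict_good() is not predict_good()

Here the test helpers simply made the argument required instead, which is why the callers above now pass predict_config={} explicitly.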
@@ -2,6 +2,7 @@
 import filecmp
 import numbers
 import re
+from copy import deepcopy
 from os import getenv
 from pathlib import Path
@@ -324,7 +325,7 @@ def test_add_features_same_booster_behaviour(tmp_path):
     d.set_label(y)
     b1 = lgb.Booster(train_set=d1)
     b = lgb.Booster(train_set=d)
-    for k in range(10):
+    for _ in range(10):
         b.update()
         b1.update()
     dname = tmp_path / "d.txt"
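Renaming k to _ in loops like the one above is bugbear's B007 (unused loop control variable): the underscore signals that the loop exists only for its side effects. Sketch:

    booster_updates = []
    for k in range(10):  # B007: "k" is never used in the body
        booster_updates.append("update")
    for _ in range(10):  # the underscore makes that intent explicit
        booster_updates.append("update")

The same rename is applied in the cegb tests further down.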
@@ -365,7 +366,7 @@ def test_add_features_from_different_sources():
     # test that method works for different data types
     d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
-    res_feature_names = [name for name in names]
+    res_feature_names = deepcopy(names)
     for idx, x_2 in enumerate(xxs, 2):
         original_type = type(d1.get_data())
         d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct()
@@ -407,7 +408,7 @@ def test_cegb_affects_behavior(tmp_path):
     ds = lgb.Dataset(X, feature_name=names).construct()
     ds.set_label(y)
     base = lgb.Booster(train_set=ds)
-    for k in range(10):
+    for _ in range(10):
         base.update()
     basename = tmp_path / "basename.txt"
     base.save_model(basename)
@@ -419,7 +420,7 @@ def test_cegb_affects_behavior(tmp_path):
              {'cegb_penalty_split': 1}]
     for case in cases:
         booster = lgb.Booster(train_set=ds, params=case)
-        for k in range(10):
+        for _ in range(10):
             booster.update()
         casename = tmp_path / "casename.txt"
         booster.save_model(casename)
@@ -445,7 +446,7 @@ def test_cegb_scaling_equalities(tmp_path):
     for (p1, p2) in pairs:
         booster1 = lgb.Booster(train_set=ds, params=p1)
         booster2 = lgb.Booster(train_set=ds, params=p2)
-        for k in range(10):
+        for _ in range(10):
             booster1.update()
             booster2.update()
         p1name = tmp_path / "p1.txt"
@@ -752,10 +753,10 @@ def test_feature_num_bin(min_data_in_bin):
     ]).T
     n_continuous = X.shape[1] - 1
     feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
-    ds_kwargs = dict(
-        params={'min_data_in_bin': min_data_in_bin},
-        categorical_feature=[n_continuous],  # last feature
-    )
+    ds_kwargs = {
+        "params": {'min_data_in_bin': min_data_in_bin},
+        "categorical_feature": [n_continuous],  # last feature
+    }
     ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
     expected_num_bins = [
         100 // min_data_in_bin + 1,  # extra bin for zero
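The dict(...) to {...} rewrite above (and the similar ones in the scikit-learn tests later in this diff) is flake8-comprehensions C408 (unnecessary dict call): a dict literal is more direct than the dict() constructor and avoids a function call. Sketch:

    # C408: dict() called with keyword arguments ...
    kwargs_old = dict(params={"min_data_in_bin": 2}, categorical_feature=[3])

    # ... rewritten as a literal.
    kwargs_new = {
        "params": {"min_data_in_bin": 2},
        "categorical_feature": [3],
    }
    assert kwargs_old == kwargs_new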
@@ -1062,9 +1062,9 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
             eval_class_weight.append({0: n_neg / n_pos, 1: n_pos / n_neg})
             init_score_value = np.log(np.mean(y_e) / (1 - np.mean(y_e)))
             if 'dataframe' in output:
-                d_init_score = dy_e.map_partitions(lambda x: pd.Series([init_score_value] * x.size))
+                d_init_score = dy_e.map_partitions(lambda x, val=init_score_value: pd.Series([val] * x.size))
             else:
-                d_init_score = dy_e.map_blocks(lambda x: np.repeat(init_score_value, x.size))
+                d_init_score = dy_e.map_blocks(lambda x, val=init_score_value: np.repeat(val, x.size))
             eval_init_score.append(d_init_score)
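Binding init_score_value through a lambda default argument above fixes bugbear's B023 (function defined in a loop uses a loop variable): Python closures capture variables late, so every lambda created in the loop would otherwise read whatever value the variable holds when the lambda finally runs, typically the last iteration's. A minimal sketch:

    # B023: every closure sees the final value of "factor".
    scalers_bad = [lambda x: x * factor for factor in (1, 2, 3)]
    assert [f(10) for f in scalers_bad] == [30, 30, 30]

    # Fix: freeze the current value via a default argument.
    scalers_good = [lambda x, factor=factor: x * factor for factor in (1, 2, 3)]
    assert [f(10) for f in scalers_good] == [10, 20, 30]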
@@ -886,13 +886,13 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better):
         min_delta = metric2min_delta[metric[0]]
     else:
         min_delta = [metric2min_delta[m] for m in metric]
-    train_kwargs = dict(
-        params=params,
-        train_set=train_ds,
-        num_boost_round=50,
-        valid_sets=[train_ds, valid_ds],
-        valid_names=['training', 'valid'],
-    )
+    train_kwargs = {
+        "params": params,
+        "train_set": train_ds,
+        "num_boost_round": 50,
+        "valid_sets": [train_ds, valid_ds],
+        "valid_names": ['training', 'valid'],
+    }

     # regular early stopping
     evals_result = {}
@@ -1771,7 +1771,7 @@ def test_monotone_constraints(test_with_categorical_variable):
         for tree in tree_str:
             # split_features are in 4th line.
             features = tree.splitlines()[3].split("=")[1].split(" ")
-            features = set(f"Column_{f}" for f in features)
+            features = {f"Column_{f}" for f in features}
             feature_sets.append(features)
         return np.array(feature_sets)
@@ -2860,14 +2860,14 @@ def test_early_stopping_for_only_first_metric():
     iter_valid1_l2 = 3
     iter_valid2_l1 = 3
     iter_valid2_l2 = 15
-    assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2
+    assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2
     iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
     iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
     iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])

     iter_cv_l1 = 15
     iter_cv_l2 = 13
-    assert len(set([iter_cv_l1, iter_cv_l2])) == 2
+    assert len({iter_cv_l1, iter_cv_l2}) == 2
     iter_cv_min = min([iter_cv_l1, iter_cv_l2])

     # test for lgb.train
@@ -313,20 +313,24 @@ def test_grid_search():
     y = y.astype(str)  # utilize label encoder at it's max power
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
-    params = dict(subsample=0.8,
-                  subsample_freq=1)
-    grid_params = dict(boosting_type=['rf', 'gbdt'],
-                       n_estimators=[4, 6],
-                       reg_alpha=[0.01, 0.005])
+    params = {
+        "subsample": 0.8,
+        "subsample_freq": 1
+    }
+    grid_params = {
+        "boosting_type": ['rf', 'gbdt'],
+        "n_estimators": [4, 6],
+        "reg_alpha": [0.01, 0.005]
+    }
     evals_result = {}
-    fit_params = dict(
-        eval_set=[(X_val, y_val)],
-        eval_metric=constant_metric,
-        callbacks=[
+    fit_params = {
+        "eval_set": [(X_val, y_val)],
+        "eval_metric": constant_metric,
+        "callbacks": [
             lgb.early_stopping(2),
             lgb.record_evaluation(evals_result)
         ]
-    )
+    }
     grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2)
     grid.fit(X_train, y_train, **fit_params)
     score = grid.score(X_test, y_test)  # utilizes GridSearchCV default refit=True
@@ -350,14 +354,20 @@ def test_random_search():
     X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1,
                                                       random_state=42)
     n_iter = 3  # Number of samples
-    params = dict(subsample=0.8,
-                  subsample_freq=1)
-    param_dist = dict(boosting_type=['rf', 'gbdt'],
-                      n_estimators=[np.random.randint(low=3, high=10) for i in range(n_iter)],
-                      reg_alpha=[np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)])
-    fit_params = dict(eval_set=[(X_val, y_val)],
-                      eval_metric=constant_metric,
-                      callbacks=[lgb.early_stopping(2)])
+    params = {
+        "subsample": 0.8,
+        "subsample_freq": 1
+    }
+    param_dist = {
+        "boosting_type": ['rf', 'gbdt'],
+        "n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)],
+        "reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)]
+    }
+    fit_params = {
+        "eval_set": [(X_val, y_val)],
+        "eval_metric": constant_metric,
+        "callbacks": [lgb.early_stopping(2)]
+    }
     rand = RandomizedSearchCV(estimator=lgb.LGBMClassifier(**params),
                               param_distributions=param_dist, cv=2,
                               n_iter=n_iter, random_state=42)
@@ -1139,7 +1149,7 @@ def test_first_metric_only():
     iter_valid1_l2 = 4
     iter_valid2_l1 = 2
     iter_valid2_l2 = 2
-    assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2
+    assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2
     iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
     iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
     iter_min = min([iter_min_l1, iter_min_l2])