Unverified Commit f3ea1ad7 authored by david-cortes, committed by GitHub

[python-package] Use scikit-learn interpretation of negative `n_jobs` and change default to number of cores (#5105)

* use joblib formula for negative n_jobs

* correction for n_jobs calculation

* use more robust cpu_count from joblib

* change default n_jobs to number of cores

* fix detection of num_threads under parameters

* better handling of n_jobs at prediction time

* fix incorrect usage of list.pop

* correct pop/remove yet again

* Update python-package/lightgbm/sklearn.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* Update tests/python_package_test/test_sklearn.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* Update tests/python_package_test/test_sklearn.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* add comments clarifying negative n_jobs

* fix CI (code taken from PR comment)

* change default to n_jobs=None in dask interface

* corrections for handling of n_jobs

* linter

* corrections for predict-time n_jobs

* linter

* add more comments about n_jobs values

* linter

* more corrections

* linter

* linter

* linter

* Update python-package/lightgbm/compat.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* Update python-package/lightgbm/sklearn.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* Update python-package/lightgbm/sklearn.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* Update python-package/lightgbm/sklearn.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* Update python-package/lightgbm/sklearn.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* workaround for passing test about outputs with multiple threads

* Update tests/python_package_test/test_sklearn.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* Update tests/python_package_test/test_sklearn.py
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent 6f92d47a
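
In short, the behavior this commit introduces, as a minimal sketch (thread counts assume a hypothetical machine with 4 physical / 8 logical cores; only lightgbm.LGBMClassifier and its n_jobs parameter are real names):

import lightgbm as lgb

clf = lgb.LGBMClassifier(n_jobs=None)  # new default: number of physical cores -> 4 threads
clf = lgb.LGBMClassifier(n_jobs=-1)    # joblib formula n_cpus + 1 + n_jobs -> 8 (all logical CPUs)
clf = lgb.LGBMClassifier(n_jobs=-2)    # 8 + 1 - 2 -> 7 threads (all but one)
clf = lgb.LGBMClassifier(n_jobs=0)     # passed through: OpenMP's own configured default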
python-package/lightgbm/compat.py
@@ -174,3 +174,21 @@ except ImportError:

         def __init__(self, *args, **kwargs):
             pass
+
+"""cpu_count()"""
+try:
+    from joblib import cpu_count
+
+    def _LGBMCpuCount(only_physical_cores: bool = True):
+        return cpu_count(only_physical_cores=only_physical_cores)
+except ImportError:
+    try:
+        from psutil import cpu_count
+
+        def _LGBMCpuCount(only_physical_cores: bool = True):
+            return cpu_count(logical=not only_physical_cores)
+    except ImportError:
+        from multiprocessing import cpu_count
+
+        def _LGBMCpuCount(only_physical_cores: bool = True):
+            return cpu_count()
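
A note on the fallback chain above: joblib and psutil can count physical cores, while multiprocessing only reports logical CPUs, which is why the last fallback ignores only_physical_cores. A minimal standalone sketch (not LightGBM code) of what each backend reports:

try:
    from joblib import cpu_count
    print(cpu_count(only_physical_cores=True))   # physical cores
    print(cpu_count(only_physical_cores=False))  # logical CPUs
except ImportError:
    from multiprocessing import cpu_count
    print(cpu_count())  # logical CPUs only; the physical count is unavailable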
python-package/lightgbm/dask.py
@@ -1111,7 +1111,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
         reg_alpha: float = 0.,
         reg_lambda: float = 0.,
         random_state: Optional[Union[int, np.random.RandomState]] = None,
-        n_jobs: int = -1,
+        n_jobs: Optional[int] = None,
         importance_type: str = 'split',
         client: Optional[Client] = None,
         **kwargs: Any
@@ -1283,7 +1283,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
         reg_alpha: float = 0.,
         reg_lambda: float = 0.,
         random_state: Optional[Union[int, np.random.RandomState]] = None,
-        n_jobs: int = -1,
+        n_jobs: Optional[int] = None,
         importance_type: str = 'split',
         client: Optional[Client] = None,
         **kwargs: Any
@@ -1436,7 +1436,7 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
         reg_alpha: float = 0.,
         reg_lambda: float = 0.,
         random_state: Optional[Union[int, np.random.RandomState]] = None,
-        n_jobs: int = -1,
+        n_jobs: Optional[int] = None,
         importance_type: str = 'split',
         client: Optional[Client] = None,
         **kwargs: Any
python-package/lightgbm/sklearn.py
@@ -10,8 +10,8 @@ from .basic import Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases
 from .callback import record_evaluation
 from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
                      _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
-                     _LGBMComputeSampleWeight, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase, dt_DataTable,
-                     pd_DataFrame)
+                     _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
+                     dt_DataTable, pd_DataFrame)
 from .engine import train

 _EvalResultType = Tuple[str, float, bool]
@@ -362,7 +362,7 @@ class LGBMModel(_LGBMModelBase):
         reg_alpha: float = 0.,
         reg_lambda: float = 0.,
         random_state: Optional[Union[int, np.random.RandomState]] = None,
-        n_jobs: int = -1,
+        n_jobs: Optional[int] = None,
         importance_type: str = 'split',
         **kwargs
     ):
@@ -425,8 +425,18 @@ class LGBMModel(_LGBMModelBase):
             If int, this number is used to seed the C++ code.
             If RandomState object (numpy), a random integer is picked based on its state to seed the C++ code.
             If None, default seeds in C++ code are used.
-        n_jobs : int, optional (default=-1)
-            Number of parallel threads to use for training (can be changed at prediction time).
+        n_jobs : int or None, optional (default=None)
+            Number of parallel threads to use for training (can be changed at prediction time by
+            passing it as an extra keyword argument).
+
+            For better performance, it is recommended to set this to the number of physical cores
+            in the CPU.
+
+            Negative integers are interpreted as following joblib's formula (n_cpus + 1 + n_jobs), just like
+            scikit-learn (so e.g. -1 means using all threads). A value of zero corresponds to the default
+            number of threads configured for OpenMP in the system. A value of ``None`` (the default)
+            corresponds to using the number of physical cores in the system (its correct detection requires
+            either the ``joblib`` or the ``psutil`` utility libraries to be installed).
         importance_type : str, optional (default='split')
             The type of feature importance to be filled into ``feature_importances_``.
             If 'split', result contains numbers of times the feature is used in a model.
@@ -634,8 +644,34 @@ class LGBMModel(_LGBMModelBase):
         # overwrite default metric by explicitly set metric
         params = _choose_param_value("metric", params, original_metric)

+        # use joblib conventions for negative n_jobs, just like scikit-learn
+        # at predict time, this is handled later due to the order of parameter updates
+        if stage == "fit":
+            params = _choose_param_value("num_threads", params, self.n_jobs)
+            params["num_threads"] = self._process_n_jobs(params["num_threads"])
+
         return params
+    def _process_n_jobs(self, n_jobs: Optional[int]) -> int:
+        """Convert special values of n_jobs to their actual values according to the formulas that apply.
+
+        Parameters
+        ----------
+        n_jobs : int or None
+            The original value of n_jobs, potentially having special values such as 'None' or
+            negative integers.
+
+        Returns
+        -------
+        n_jobs : int
+            The value of n_jobs with special values converted to the actual number of threads.
+        """
+        if n_jobs is None:
+            n_jobs = _LGBMCpuCount(only_physical_cores=True)
+        elif n_jobs < 0:
+            n_jobs = max(_LGBMCpuCount(only_physical_cores=False) + 1 + n_jobs, 1)
+
+        return n_jobs
+
     def fit(
         self,
         X,
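
To make the conversion above concrete, here is a worked, self-contained re-implementation (the helper name resolve_n_jobs and the core counts 4 physical / 8 logical are made up for illustration; the real method queries _LGBMCpuCount):

from typing import Optional

def resolve_n_jobs(n_jobs: Optional[int], physical: int = 4, logical: int = 8) -> int:
    if n_jobs is None:
        return physical                      # default: number of physical cores
    if n_jobs < 0:
        return max(logical + 1 + n_jobs, 1)  # joblib formula, floored at one thread
    return n_jobs                            # 0 and positive values pass through unchanged

assert resolve_n_jobs(None) == 4
assert resolve_n_jobs(-1) == 8    # all logical CPUs
assert resolve_n_jobs(-2) == 7    # all but one
assert resolve_n_jobs(-100) == 1  # never drops below one thread
assert resolve_n_jobs(0) == 0     # 0 is forwarded: OpenMP's configured default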
@@ -810,6 +846,15 @@ class LGBMModel(_LGBMModelBase):
         ):
             predict_params.pop(alias, None)
         predict_params.update(kwargs)
+
+        # the number of threads can have values with special meaning which are only applied
+        # in the scikit-learn interface; these should not reach the C++ side as-is
+        n_jobs = self.n_jobs
+        for alias in _ConfigAliases.get("num_threads"):
+            if alias in predict_params:
+                n_jobs = predict_params.pop(alias)
+        predict_params["num_threads"] = self._process_n_jobs(n_jobs)
+
         return self._Booster.predict(X, raw_score=raw_score, start_iteration=start_iteration, num_iteration=num_iteration,
                                      pred_leaf=pred_leaf, pred_contrib=pred_contrib, **predict_params)
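
A hedged usage sketch of the predict-time handling above (synthetic data; any of LightGBM's num_threads aliases, including n_jobs itself, can be passed to predict as a keyword argument):

import numpy as np
import lightgbm as lgb

X = np.random.rand(100, 5)
y = np.random.rand(100)
model = lgb.LGBMRegressor(n_estimators=2, n_jobs=None).fit(X, y)

preds_default = model.predict(X)              # resolved from self.n_jobs: physical cores
preds_single = model.predict(X, n_jobs=1)     # per-call override: one thread
preds_all = model.predict(X, num_threads=-1)  # alias works too; joblib formula applies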
tests/python_package_test/test_consistency.py
@@ -21,7 +21,7 @@ class FileLoader:
             if line and not line.startswith('#'):
                 key, value = [token.strip() for token in line.split('=')]
                 if 'early_stopping' not in key:  # disable early_stopping
-                    self.params[key] = value if key != 'num_trees' else int(value)
+                    self.params[key] = value if key not in {'num_trees', 'num_threads'} else int(value)

     def load_dataset(self, suffix, is_sparse=False):
         filename = str(self.path(suffix))
@@ -84,7 +84,7 @@ def test_binary_linear():
     X_test, _, X_test_fn = fd.load_dataset('.test')
     weight_train = fd.load_field('.train.weight')
     lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
-    gbm = lgb.LGBMClassifier(**fd.params)
+    gbm = lgb.LGBMClassifier(**fd.params, n_jobs=0)
     gbm.fit(X_train, y_train, sample_weight=weight_train)
     sk_pred = gbm.predict_proba(X_test)[:, 1]
     fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
tests/python_package_test/test_sklearn.py
 # coding: utf-8
 import itertools
 import math
+import re
 from os import getenv
 from pathlib import Path
@@ -1288,3 +1289,29 @@ def test_multiclass_custom_objective():
     np.testing.assert_allclose(builtin_obj_preds, custom_obj_preds, rtol=0.01)
     assert not callable(builtin_obj_model.objective_)
     assert callable(custom_obj_model.objective_)
+
+
+def test_negative_n_jobs(tmp_path):
+    n_threads = joblib.cpu_count()
+    if n_threads <= 1:
+        return None
+    # 'val_minus_two' here is the expected number of threads for n_jobs=-2;
+    # according to joblib's formula (n_cpus + 1 + n_jobs), n_jobs=-2 means "use all but one thread"
+    val_minus_two = n_threads - 1
+    X, y = load_breast_cancer(return_X_y=True)
+    gbm = lgb.LGBMClassifier(n_estimators=2, verbose=-1, n_jobs=-2).fit(X, y)
+    gbm.booster_.save_model(tmp_path / "model.txt")
+    with open(tmp_path / "model.txt", "r") as f:
+        model_txt = f.read()
+    assert bool(re.search(rf"\[num_threads: {val_minus_two}\]", model_txt))
+
+
+def test_default_n_jobs(tmp_path):
+    n_cores = joblib.cpu_count(only_physical_cores=True)
+    X, y = load_breast_cancer(return_X_y=True)
+    gbm = lgb.LGBMClassifier(n_estimators=2, verbose=-1, n_jobs=None).fit(X, y)
+    gbm.booster_.save_model(tmp_path / "model.txt")
+    with open(tmp_path / "model.txt", "r") as f:
+        model_txt = f.read()
+    assert bool(re.search(rf"\[num_threads: {n_cores}\]", model_txt))