"src/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "861de1c1f0c4cd2f3d054aac93f3c6efa172d215"
Unverified commit 90342e92, authored by Nikita Titov and committed by GitHub.
Browse files

[python] allow to pass some params as pathlib.Path objects (#4440)

* allow to pass some params as pathlib.Path objects

* fix lint

* improve indentation
parent b09da434
......@@ -3,7 +3,7 @@
Contributors: https://github.com/microsoft/LightGBM/graphs/contributors.
"""
import os
from pathlib import Path
from .basic import Booster, Dataset, Sequence, register_logger
from .callback import early_stopping, print_evaluation, record_evaluation, reset_parameter
......@@ -23,11 +23,9 @@ except ImportError:
pass
dir_path = os.path.dirname(os.path.realpath(__file__))
if os.path.isfile(os.path.join(dir_path, 'VERSION.txt')):
with open(os.path.join(dir_path, 'VERSION.txt')) as version_file:
__version__ = version_file.read().strip()
_version_path = Path(__file__).parent.absolute() / 'VERSION.txt'
if _version_path.is_file():
__version__ = _version_path.read_text(encoding='utf-8').strip()
__all__ = ['Dataset', 'Booster', 'CVBooster', 'Sequence',
'register_logger',
......
......@@ -3,12 +3,14 @@
import abc
import ctypes
import json
import os
import warnings
from collections import OrderedDict
from copy import deepcopy
from functools import wraps
from logging import Logger
from os import SEEK_END
from os.path import getsize
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
......@@ -243,7 +245,7 @@ def param_dict_to_str(data):
else:
return str(x)
pairs.append(f"{key}={','.join(map(to_string, val))}")
elif isinstance(val, (str, NUMERIC_TYPES)) or is_numeric(val):
elif isinstance(val, (str, Path, NUMERIC_TYPES)) or is_numeric(val):
pairs.append(f"{key}={val}")
elif val is not None:
raise TypeError(f'Unknown type of parameter:{key}, got:{type(val).__name__}')
......@@ -251,23 +253,17 @@ def param_dict_to_str(data):
class _TempFile:
"""Proxy class to workaround errors on Windows."""
def __enter__(self):
with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
self.name = f.name
self.path = Path(self.name)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if os.path.isfile(self.name):
os.remove(self.name)
def readlines(self):
with open(self.name, "r+") as f:
ret = f.readlines()
return ret
def writelines(self, lines):
with open(self.name, "w+") as f:
f.writelines(lines)
if self.path.is_file():
self.path.unlink()
class LightGBMError(Exception):
......@@ -584,12 +580,12 @@ def _load_pandas_categorical(file_name=None, model_str=None):
pandas_key = 'pandas_categorical:'
offset = -len(pandas_key)
if file_name is not None:
max_offset = -os.path.getsize(file_name)
max_offset = -getsize(file_name)
with open(file_name, 'rb') as f:
while True:
if offset < max_offset:
offset = max_offset
f.seek(offset, os.SEEK_END)
f.seek(offset, SEEK_END)
lines = f.readlines()
if len(lines) >= 2:
break
......@@ -685,7 +681,7 @@ class _InnerPredictor:
Parameters
----------
model_file : string or None, optional (default=None)
model_file : string, pathlib.Path or None, optional (default=None)
Path to the model file.
booster_handle : object or None, optional (default=None)
Handle of Booster.
......@@ -698,7 +694,7 @@ class _InnerPredictor:
"""Prediction task"""
out_num_iterations = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
c_str(model_file),
c_str(str(model_file)),
ctypes.byref(out_num_iterations),
ctypes.byref(self.handle)))
out_num_class = ctypes.c_int(0)
......@@ -743,9 +739,9 @@ class _InnerPredictor:
Parameters
----------
data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
Data source for prediction.
When data type is string, it represents the path of txt file.
When data type is string or pathlib.Path, it represents the path of txt file.
start_iteration : int, optional (default=0)
Start index of the iteration to predict.
num_iteration : int, optional (default=-1)
......@@ -780,21 +776,19 @@ class _InnerPredictor:
predict_type = C_API_PREDICT_CONTRIB
int_data_has_header = 1 if data_has_header else 0
if isinstance(data, str):
if isinstance(data, (str, Path)):
with _TempFile() as f:
_safe_call(_LIB.LGBM_BoosterPredictForFile(
self.handle,
c_str(data),
c_str(str(data)),
ctypes.c_int(int_data_has_header),
ctypes.c_int(predict_type),
ctypes.c_int(start_iteration),
ctypes.c_int(num_iteration),
c_str(self.pred_parameter),
c_str(f.name)))
lines = f.readlines()
nrow = len(lines)
preds = [float(token) for line in lines for token in line.split('\t')]
preds = np.array(preds, dtype=np.float64, copy=False)
preds = np.loadtxt(f.name, dtype=np.float64)
nrow = preds.shape[0]
elif isinstance(data, scipy.sparse.csr_matrix):
preds, nrow = self.__pred_for_csr(data, start_iteration, num_iteration, predict_type)
elif isinstance(data, scipy.sparse.csc_matrix):
......@@ -829,9 +823,9 @@ class _InnerPredictor:
def __get_num_preds(self, start_iteration, num_iteration, nrow, predict_type):
"""Get size of prediction result."""
if nrow > MAX_INT32:
raise LightGBMError('LightGBM cannot perform prediction for data'
raise LightGBMError('LightGBM cannot perform prediction for data '
f'with number of rows greater than MAX_INT32 ({MAX_INT32}).\n'
'You can split your data into chunks'
'You can split your data into chunks '
'and then concatenate predictions for them')
n_preds = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterCalcNumPredict(
......@@ -1133,9 +1127,9 @@ class Dataset:
Parameters
----------
data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequences or list of numpy arrays
data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequences or list of numpy arrays
Data source of Dataset.
If string, it represents the path to txt file.
If string or pathlib.Path, it represents the path to txt file.
label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
Label of the data.
reference : Dataset or None, optional (default=None)
......@@ -1384,7 +1378,7 @@ class Dataset:
def _set_init_score_by_predictor(self, predictor, data, used_indices=None):
data_has_header = False
if isinstance(data, str):
if isinstance(data, (str, Path)):
# check data has header or not
data_has_header = any(self.params.get(alias, False) for alias in _ConfigAliases.get("header"))
num_data = self.num_data()
......@@ -1395,7 +1389,7 @@ class Dataset:
is_reshape=False)
if used_indices is not None:
assert not self.need_slice
if isinstance(data, str):
if isinstance(data, (str, Path)):
sub_init_score = np.empty(num_data * predictor.num_class, dtype=np.float32)
assert num_data == len(used_indices)
for i in range(len(used_indices)):
......@@ -1472,10 +1466,10 @@ class Dataset:
elif reference is not None:
raise TypeError('Reference dataset should be None or dataset instance')
# start construct data
if isinstance(data, str):
if isinstance(data, (str, Path)):
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_DatasetCreateFromFile(
c_str(data),
c_str(str(data)),
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
......@@ -1775,9 +1769,9 @@ class Dataset:
Parameters
----------
data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequences or list of numpy arrays
data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequences or list of numpy arrays
Data source of Dataset.
If string, it represents the path to txt file.
If string or pathlib.Path, it represents the path to txt file.
label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)
Label of the data.
weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
......@@ -1842,7 +1836,7 @@ class Dataset:
Parameters
----------
filename : string
filename : string or pathlib.Path
Name of the output file.
Returns
......@@ -1852,7 +1846,7 @@ class Dataset:
"""
_safe_call(_LIB.LGBM_DatasetSaveBinary(
self.construct().handle,
c_str(filename)))
c_str(str(filename))))
return self
def _update_params(self, params):
......@@ -2242,7 +2236,7 @@ class Dataset:
Returns
-------
data : string, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of numpy arrays or None
data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of numpy arrays or None
Raw data used in the Dataset construction.
"""
if self.handle is None:
......@@ -2442,7 +2436,7 @@ class Dataset:
Parameters
----------
filename : string
filename : string or pathlib.Path
Name of the output file.
Returns
......@@ -2452,7 +2446,7 @@ class Dataset:
"""
_safe_call(_LIB.LGBM_DatasetDumpText(
self.construct().handle,
c_str(filename)))
c_str(str(filename))))
return self
......@@ -2468,7 +2462,7 @@ class Booster:
Parameters for Booster.
train_set : Dataset or None, optional (default=None)
Training dataset.
model_file : string or None, optional (default=None)
model_file : string, pathlib.Path or None, optional (default=None)
Path to the model file.
model_str : string or None, optional (default=None)
Model will be loaded from this string.
......@@ -2561,7 +2555,7 @@ class Booster:
out_num_iterations = ctypes.c_int(0)
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
c_str(model_file),
c_str(str(model_file)),
ctypes.byref(out_num_iterations),
ctypes.byref(self.handle)))
out_num_class = ctypes.c_int(0)
......@@ -3200,7 +3194,7 @@ class Booster:
Parameters
----------
filename : string
filename : string or pathlib.Path
Filename to save Booster.
num_iteration : int or None, optional (default=None)
Index of the iteration that should be saved.
......@@ -3226,7 +3220,7 @@ class Booster:
ctypes.c_int(start_iteration),
ctypes.c_int(num_iteration),
ctypes.c_int(importance_type_int),
c_str(filename)))
c_str(str(filename))))
_dump_pandas_categorical(self.pandas_categorical, filename)
return self
......@@ -3400,9 +3394,9 @@ class Booster:
Parameters
----------
data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
Data source for prediction.
If string, it represents the path to txt file.
If string or pathlib.Path, it represents the path to txt file.
start_iteration : int, optional (default=0)
Start index of the iteration to predict.
If <= 0, starts from the first iteration.
......@@ -3455,9 +3449,9 @@ class Booster:
Parameters
----------
data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
data : string, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
Data source for refit.
If string, it represents the path to txt file.
If string or pathlib.Path, it represents the path to txt file.
label : list, numpy 1-D array or pandas Series / one-column DataFrame
Label for refit.
decay_rate : float, optional (default=0.9)
......
......@@ -3,6 +3,7 @@
import collections
import copy
from operator import attrgetter
from pathlib import Path
import numpy as np
......@@ -76,7 +77,7 @@ def train(params, train_set, num_boost_round=100,
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
To ignore the default metric corresponding to the used objective,
set the ``metric`` parameter to the string ``"None"`` in ``params``.
init_model : string, Booster or None, optional (default=None)
init_model : string, pathlib.Path, Booster or None, optional (default=None)
Filename of LightGBM model or Booster instance used for continue training.
feature_name : list of strings or 'auto', optional (default="auto")
Feature names.
......@@ -161,7 +162,7 @@ def train(params, train_set, num_boost_round=100,
if num_boost_round <= 0:
raise ValueError("num_boost_round should be greater than zero.")
if isinstance(init_model, str):
if isinstance(init_model, (str, Path)):
predictor = _InnerPredictor(model_file=init_model, pred_parameter=params)
elif isinstance(init_model, Booster):
predictor = init_model._to_predictor(dict(init_model.params, **params))
......@@ -470,7 +471,7 @@ def cv(params, train_set, num_boost_round=100,
If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
To ignore the default metric corresponding to the used objective,
set ``metrics`` to the string ``"None"``.
init_model : string, Booster or None, optional (default=None)
init_model : string, pathlib.Path, Booster or None, optional (default=None)
Filename of LightGBM model or Booster instance used for continue training.
feature_name : list of strings or 'auto', optional (default="auto")
Feature names.
......@@ -545,7 +546,7 @@ def cv(params, train_set, num_boost_round=100,
if num_boost_round <= 0:
raise ValueError("num_boost_round should be greater than zero.")
if isinstance(init_model, str):
if isinstance(init_model, (str, Path)):
predictor = _InnerPredictor(model_file=init_model, pred_parameter=params)
elif isinstance(init_model, Booster):
predictor = init_model._to_predictor(dict(init_model.params, **params))
......
# coding: utf-8
"""Find the path to LightGBM dynamic library files."""
import os
from os import environ
from pathlib import Path
from platform import system
from typing import List
......@@ -13,27 +14,26 @@ def find_lib_path() -> List[str]:
lib_path: list of strings
List of all found library paths to LightGBM.
"""
if os.environ.get('LIGHTGBM_BUILD_DOC', False):
if environ.get('LIGHTGBM_BUILD_DOC', False):
# we don't need lib_lightgbm while building docs
return []
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
curr_path = Path(__file__).parent.absolute()
dll_path = [curr_path,
os.path.join(curr_path, '../../'),
os.path.join(curr_path, 'compile'),
os.path.join(curr_path, '../compile'),
os.path.join(curr_path, '../../lib/')]
curr_path.parents[1],
curr_path / 'compile',
curr_path.parent / 'compile',
curr_path.parents[1] / 'lib']
if system() in ('Windows', 'Microsoft'):
dll_path.append(os.path.join(curr_path, '../compile/Release/'))
dll_path.append(os.path.join(curr_path, '../compile/windows/x64/DLL/'))
dll_path.append(os.path.join(curr_path, '../../Release/'))
dll_path.append(os.path.join(curr_path, '../../windows/x64/DLL/'))
dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path]
dll_path.append(curr_path.parent / 'compile' / 'Release')
dll_path.append(curr_path.parent / 'compile' / 'windows' / 'x64' / 'DLL')
dll_path.append(curr_path.parents[1] / 'Release')
dll_path.append(curr_path.parents[1] / 'windows' / 'x64' / 'DLL')
dll_path = [p / 'lib_lightgbm.dll' for p in dll_path]
else:
dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path]
lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
dll_path = [p / 'lib_lightgbm.so' for p in dll_path]
lib_path = [str(p) for p in dll_path if p.is_file()]
if not lib_path:
dll_path = [os.path.realpath(p) for p in dll_path]
new_line = "\n"
raise Exception(f'Cannot find lightgbm library file in following paths:{new_line}{new_line.join(dll_path)}')
dll_path_joined = '\n'.join(map(str, dll_path))
raise Exception(f'Cannot find lightgbm library file in following paths:\n{dll_path_joined}')
return lib_path
......@@ -256,7 +256,7 @@ _lgbmmodel_doc_fit = (
callbacks : list of callback functions or None, optional (default=None)
List of callback functions that are applied at each iteration.
See Callbacks in Python API for more information.
init_model : string, Booster, LGBMModel or None, optional (default=None)
init_model : string, pathlib.Path, Booster, LGBMModel or None, optional (default=None)
Filename of LightGBM model, Booster instance or LGBMModel instance used for continue training.
Returns
......
......@@ -49,8 +49,8 @@ def test_basic(tmp_path):
assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
assert bst.upper_bound() == pytest.approx(3.3182142872462883)
tname = str(tmp_path / "svm_light.dat")
model_file = str(tmp_path / "model.txt")
tname = tmp_path / "svm_light.dat"
model_file = tmp_path / "model.txt"
bst.save_model(model_file)
pred_from_matr = bst.predict(X_test)
......@@ -153,8 +153,8 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
X = data[:, :-1]
Y = data[:, -1]
npy_bin_fname = str(tmpdir / 'data_from_npy.bin')
seq_bin_fname = str(tmpdir / 'data_from_seq.bin')
npy_bin_fname = tmpdir / 'data_from_npy.bin'
seq_bin_fname = tmpdir / 'data_from_seq.bin'
# Create dataset from numpy array directly.
ds = lgb.Dataset(X, label=Y, params=params)
......@@ -175,9 +175,9 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
valid_X = valid_data[:, :-1]
valid_Y = valid_data[:, -1]
valid_npy_bin_fname = str(tmpdir / 'valid_data_from_npy.bin')
valid_seq_bin_fname = str(tmpdir / 'valid_data_from_seq.bin')
valid_seq2_bin_fname = str(tmpdir / 'valid_data_from_seq2.bin')
valid_npy_bin_fname = tmpdir / 'valid_data_from_npy.bin'
valid_seq_bin_fname = tmpdir / 'valid_data_from_seq.bin'
valid_seq2_bin_fname = tmpdir / 'valid_data_from_seq2.bin'
valid_ds = lgb.Dataset(valid_X, label=valid_Y, params=params, reference=ds)
valid_ds.save_binary(valid_npy_bin_fname)
......@@ -268,10 +268,10 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
d1.add_features_from(d2)
d1name = str(tmp_path / "d1.txt")
d1name = tmp_path / "d1.txt"
d1._dump_text(d1name)
d = lgb.Dataset(X, feature_name=names).construct()
dname = str(tmp_path / "d.txt")
dname = tmp_path / "d.txt"
d._dump_text(dname)
with open(d1name, 'rt') as d1f:
d1txt = d1f.read()
......@@ -297,8 +297,8 @@ def test_add_features_same_booster_behaviour(tmp_path):
for k in range(10):
b.update()
b1.update()
dname = str(tmp_path / "d.txt")
d1name = str(tmp_path / "d1.txt")
dname = tmp_path / "d.txt"
d1name = tmp_path / "d1.txt"
b1.save_model(d1name)
b.save_model(dname)
with open(dname, 'rt') as df:
......@@ -352,7 +352,7 @@ def test_cegb_affects_behavior(tmp_path):
base = lgb.Booster(train_set=ds)
for k in range(10):
base.update()
basename = str(tmp_path / "basename.txt")
basename = tmp_path / "basename.txt"
base.save_model(basename)
with open(basename, 'rt') as f:
basetxt = f.read()
......@@ -364,7 +364,7 @@ def test_cegb_affects_behavior(tmp_path):
booster = lgb.Booster(train_set=ds, params=case)
for k in range(10):
booster.update()
casename = str(tmp_path / "casename.txt")
casename = tmp_path / "casename.txt"
booster.save_model(casename)
with open(casename, 'rt') as f:
casetxt = f.read()
......@@ -391,13 +391,13 @@ def test_cegb_scaling_equalities(tmp_path):
for k in range(10):
booster1.update()
booster2.update()
p1name = str(tmp_path / "p1.txt")
p1name = tmp_path / "p1.txt"
# Reset booster1's parameters to p2, so the parameter section of the file matches.
booster1.reset_parameter(p2)
booster1.save_model(p1name)
with open(p1name, 'rt') as f:
p1txt = f.read()
p2name = str(tmp_path / "p2.txt")
p2name = tmp_path / "p2.txt"
booster2.save_model(p2name)
with open(p2name, 'rt') as f:
p2txt = f.read()
......
......@@ -24,7 +24,7 @@ class FileLoader:
self.params[key] = value if key != 'num_trees' else int(value)
def load_dataset(self, suffix, is_sparse=False):
filename = self.path(suffix)
filename = str(self.path(suffix))
if is_sparse:
X, Y = load_svmlight_file(filename, dtype=np.float64, zero_based=True)
return X, Y, filename
......@@ -62,7 +62,7 @@ class FileLoader:
assert a == b, f
def path(self, suffix):
return str(self.directory / f'{self.prefix}{suffix}')
return self.directory / f'{self.prefix}{suffix}'
def test_binary():
......
......@@ -2261,7 +2261,7 @@ def test_forced_bins():
x[:, 0] = np.arange(0, 1, 0.01)
x[:, 1] = -np.arange(0, 1, 0.01)
y = np.arange(0, 1, 0.01)
forcedbins_filename = str(
forcedbins_filename = (
Path(__file__).absolute().parents[2] / 'examples' / 'regression' / 'forced_bins.json'
)
params = {'objective': 'regression_l1',
......@@ -2285,7 +2285,7 @@ def test_forced_bins():
est = lgb.train(params, lgb_x, num_boost_round=20)
predicted = est.predict(new_x)
assert len(np.unique(predicted)) == 3
params['forcedbins_filename'] = str(
params['forcedbins_filename'] = (
Path(__file__).absolute().parents[2] / 'examples' / 'regression' / 'forced_bins2.json'
)
params['max_bin'] = 11
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment