Unverified Commit 1b792e71 authored by James Lamb, committed by GitHub

[ci] [python-package] enable ruff-format on tests and examples (#6317)

parent b60068c8
@@ -7,6 +7,12 @@ exclude: |
)$
repos:
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
args: ["--settings-path", "python-package/pyproject.toml"]
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.2.1
@@ -14,12 +20,8 @@ repos:
# Run the linter.
- id: ruff
args: ["--config", "python-package/pyproject.toml"]
types_or: [python, jupyter]
# Run the formatter.
- id: ruff-format
args: ["--config", "python-package/pyproject.toml"]
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
args: ["--settings-path", "python-package/pyproject.toml"]
types_or: [python, jupyter]
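With the hooks arranged as above, isort runs first and the new `ruff-format` hook covers both Python files and Jupyter notebooks (`types_or: [python, jupyter]`). Assuming pre-commit is installed, the formatter hook can be exercised locally with, e.g.:

    pre-commit run ruff-format --all-files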
@@ -10,13 +10,13 @@ from sklearn.metrics import roc_auc_score
import lightgbm as lgb
print('Loading data...')
print("Loading data...")
# load or create your dataset
binary_example_dir = Path(__file__).absolute().parents[1] / 'binary_classification'
df_train = pd.read_csv(str(binary_example_dir / 'binary.train'), header=None, sep='\t')
df_test = pd.read_csv(str(binary_example_dir / 'binary.test'), header=None, sep='\t')
W_train = pd.read_csv(str(binary_example_dir / 'binary.train.weight'), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / 'binary.test.weight'), header=None)[0]
binary_example_dir = Path(__file__).absolute().parents[1] / "binary_classification"
df_train = pd.read_csv(str(binary_example_dir / "binary.train"), header=None, sep="\t")
df_test = pd.read_csv(str(binary_example_dir / "binary.test"), header=None, sep="\t")
W_train = pd.read_csv(str(binary_example_dir / "binary.train.weight"), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / "binary.test.weight"), header=None)[0]
y_train = df_train[0]
y_test = df_test[0]
@@ -27,72 +27,72 @@ num_train, num_feature = X_train.shape
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
weight=W_test, free_raw_data=False)
lgb_train = lgb.Dataset(X_train, y_train, weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, weight=W_test, free_raw_data=False)
# specify your configurations as a dict
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'binary_logloss',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
"boosting_type": "gbdt",
"objective": "binary",
"metric": "binary_logloss",
"num_leaves": 31,
"learning_rate": 0.05,
"feature_fraction": 0.9,
"bagging_fraction": 0.8,
"bagging_freq": 5,
"verbose": 0,
}
# generate feature names
feature_name = [f'feature_{col}' for col in range(num_feature)]
feature_name = [f"feature_{col}" for col in range(num_feature)]
print('Starting training...')
print("Starting training...")
# feature_name and categorical_feature
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21])
print('Finished first 10 rounds...')
gbm = lgb.train(
params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21],
)
print("Finished first 10 rounds...")
# check feature name
print(f'7th feature name is: {lgb_train.feature_name[6]}')
print(f"7th feature name is: {lgb_train.feature_name[6]}")
print('Saving model...')
print("Saving model...")
# save model to file
gbm.save_model('model.txt')
gbm.save_model("model.txt")
print('Dumping model to JSON...')
print("Dumping model to JSON...")
# dump model to JSON (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
with open("model.json", "w+") as f:
json.dump(model_json, f, indent=4)
# feature names
print(f'Feature names: {gbm.feature_name()}')
print(f"Feature names: {gbm.feature_name()}")
# feature importances
print(f'Feature importances: {list(gbm.feature_importance())}')
print(f"Feature importances: {list(gbm.feature_importance())}")
print('Loading model to predict...')
print("Loading model to predict...")
# load model to predict
bst = lgb.Booster(model_file='model.txt')
bst = lgb.Booster(model_file="model.txt")
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
auc_loaded_model = roc_auc_score(y_test, y_pred)
print(f"The ROC AUC of loaded model's prediction is: {auc_loaded_model}")
print('Dumping and loading model with pickle...')
print("Dumping and loading model with pickle...")
# dump model with pickle
with open('model.pkl', 'wb') as fout:
with open("model.pkl", "wb") as fout:
pickle.dump(gbm, fout)
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
with open("model.pkl", "rb") as fin:
pkl_bst = pickle.load(fin)
# can predict with any iteration when the model is loaded via pickle
y_pred = pkl_bst.predict(X_test, num_iteration=7)
@@ -104,36 +104,36 @@ print(f"The ROC AUC of pickled model's prediction is: {auc_pickled_model}")
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model='model.txt',
valid_sets=lgb_eval)
gbm = lgb.train(params, lgb_train, num_boost_round=10, init_model="model.txt", valid_sets=lgb_eval)
print('Finished 10 - 20 rounds with model file...')
print("Finished 10 - 20 rounds with model file...")
# decay learning rates
# reset_parameter callback accepts:
# 1. list with length = num_boost_round
# 2. function(curr_iter)
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99 ** iter))])
gbm = lgb.train(
params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99**iter))],
)
print('Finished 20 - 30 rounds with decay learning rates...')
print("Finished 20 - 30 rounds with decay learning rates...")
# change other parameters during training
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
gbm = lgb.train(
params,
lgb_train,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)],
)
print('Finished 30 - 40 rounds with changing bagging_fraction...')
print("Finished 30 - 40 rounds with changing bagging_fraction...")
# self-defined objective function
@@ -141,9 +141,9 @@ print('Finished 30 - 40 rounds with changing bagging_fraction...')
# log likelihood loss
def loglikelihood(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels
hess = preds * (1. - preds)
hess = preds * (1.0 - preds)
return grad, hess
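# For reference, a sketch of the math behind loglikelihood() above: with raw
# score s, p = sigmoid(s), and label y, the loss is
#   L(s) = -y*log(p) - (1 - y)*log(1 - p),
# whose first and second derivatives with respect to s are
#   dL/ds = p - y  (grad)   and   d2L/ds2 = p*(1 - p)  (hess).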
@@ -156,22 +156,19 @@ def loglikelihood(preds, train_data):
# Keep this in mind when you use the customization
def binary_error(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
return 'error', np.mean(labels != (preds > 0.5)), False
preds = 1.0 / (1.0 + np.exp(-preds))
return "error", np.mean(labels != (preds > 0.5)), False
# Pass custom objective function through params
params_custom_obj = copy.deepcopy(params)
params_custom_obj['objective'] = loglikelihood
params_custom_obj["objective"] = loglikelihood
gbm = lgb.train(params_custom_obj,
lgb_train,
num_boost_round=10,
init_model=gbm,
feval=binary_error,
valid_sets=lgb_eval)
gbm = lgb.train(
params_custom_obj, lgb_train, num_boost_round=10, init_model=gbm, feval=binary_error, valid_sets=lgb_eval
)
print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
print("Finished 40 - 50 rounds with self-defined objective function and eval metric...")
# another self-defined eval metric
@@ -183,24 +180,26 @@ print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
# Keep this in mind when you use the customization
def accuracy(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
return 'accuracy', np.mean(labels == (preds > 0.5)), True
preds = 1.0 / (1.0 + np.exp(-preds))
return "accuracy", np.mean(labels == (preds > 0.5)), True
# Pass custom objective function through params
params_custom_obj = copy.deepcopy(params)
params_custom_obj['objective'] = loglikelihood
params_custom_obj["objective"] = loglikelihood
gbm = lgb.train(params_custom_obj,
lgb_train,
num_boost_round=10,
init_model=gbm,
feval=[binary_error, accuracy],
valid_sets=lgb_eval)
gbm = lgb.train(
params_custom_obj,
lgb_train,
num_boost_round=10,
init_model=gbm,
feval=[binary_error, accuracy],
valid_sets=lgb_eval,
)
print('Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...')
print("Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...")
print('Starting a new training job...')
print("Starting a new training job...")
# callback
@@ -208,17 +207,14 @@ def reset_metrics():
def callback(env):
lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
if env.iteration - env.begin_iteration == 5:
print('Add a new valid dataset at iteration 5...')
env.model.add_valid(lgb_eval_new, 'new_valid')
print("Add a new valid dataset at iteration 5...")
env.model.add_valid(lgb_eval_new, "new_valid")
callback.before_iteration = True
callback.order = 0
return callback
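# For context: LightGBM runs a callback before each boosting iteration when its
# `before_iteration` attribute is truthy (otherwise after it), and sorts callbacks
# by their `order` attribute; that is why the factory above sets both on the
# closure it returns.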
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train,
callbacks=[reset_metrics()])
gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_train, callbacks=[reset_metrics()])
print('Finished first 10 rounds with callback function...')
print("Finished first 10 rounds with callback function...")
@@ -10,9 +10,9 @@ import lightgbm as lgb
if __name__ == "__main__":
print("loading data")
rank_example_dir = Path(__file__).absolute().parents[2] / 'lambdarank'
X, y = load_svmlight_file(str(rank_example_dir / 'rank.train'))
group = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
rank_example_dir = Path(__file__).absolute().parents[2] / "lambdarank"
X, y = load_svmlight_file(str(rank_example_dir / "rank.train"))
group = np.loadtxt(str(rank_example_dir / "rank.train.query"))
print("initializing a Dask cluster")
@@ -32,25 +32,14 @@ if __name__ == "__main__":
# a sparse boundary to partition the data
X = X.toarray()
dX = da.from_array(
x=X,
chunks=[
(rows_in_part1, rows_in_part2),
(num_features,)
]
)
dX = da.from_array(x=X, chunks=[(rows_in_part1, rows_in_part2), (num_features,)])
dy = da.from_array(
x=y,
chunks=[
(rows_in_part1, rows_in_part2),
]
)
dg = da.from_array(
x=group,
chunks=[
(100, group.size - 100)
]
],
)
dg = da.from_array(x=group, chunks=[(100, group.size - 100)])
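# For context: passing explicit per-axis chunk sizes (e.g. (rows_in_part1,
# rows_in_part2) along the row axis) yields exactly two Dask partitions, so the
# row partitions of dX and dy line up with the query-group boundaries in dg,
# which distributed ranking requires.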
print("beginning training")
@@ -34,13 +34,13 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size):
data = []
ylist = []
for f in input_flist:
f = h5py.File(f, 'r')
data.append(HDFSequence(f['X'], batch_size))
ylist.append(f['Y'][:])
f = h5py.File(f, "r")
data.append(HDFSequence(f["X"], batch_size))
ylist.append(f["Y"][:])
params = {
'bin_construct_sample_cnt': 200000,
'max_bin': 255,
"bin_construct_sample_cnt": 200000,
"max_bin": 255,
}
y = np.concatenate(ylist)
dataset = lgb.Dataset(data, label=y, params=params)
@@ -51,7 +51,7 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size):
# The reason is that DataFrame column names will be used in Dataset. For a DataFrame with Int64Index
# as columns, Dataset will use column names like ["0", "1", "2", ...], while for a numpy array, column
# names fall back to the defaults assigned in C++ code (dataset_loader.cpp), like ["Column_0", "Column_1", ...].
dataset.save_binary('regression.train.from_hdf.bin')
dataset.save_binary("regression.train.from_hdf.bin")
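For context, the `HDFSequence` used above is an `lgb.Sequence` subclass defined earlier in this example file, outside the hunks shown here; a minimal sketch of such a sequence (assuming an h5py dataset that supports slicing) is:

class HDFSequence(lgb.Sequence):
    def __init__(self, hdf_dataset, batch_size):
        self.data = hdf_dataset  # h5py dataset; rows are read lazily per slice
        self.batch_size = batch_size  # number of rows LightGBM reads per batch

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)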
def save2hdf(input_data, fname, batch_size):
@@ -59,7 +59,7 @@ def save2hdf(input_data, fname, batch_size):
Note the chunk size settings in the implementation; they are chosen for I/O performance.
"""
with h5py.File(fname, 'w') as f:
with h5py.File(fname, "w") as f:
for name, data in input_data.items():
nrow, ncol = data.shape
if ncol == 1:
@@ -75,12 +75,12 @@ def save2hdf(input_data, fname, batch_size):
# Also note that the data is stored in row-major order to avoid an extra copy when passing it to
# the lightgbm Dataset.
chunk = (batch_size, ncol)
f.create_dataset(name, data=data, chunks=chunk, compression='lzf')
f.create_dataset(name, data=data, chunks=chunk, compression="lzf")
def generate_hdf(input_fname, output_basename, batch_size):
# Save to 2 HDF5 files for demonstration.
df = pd.read_csv(input_fname, header=None, sep='\t')
df = pd.read_csv(input_fname, header=None, sep="\t")
mid = len(df) // 2
df1 = df.iloc[:mid]
@@ -88,25 +88,23 @@ def generate_hdf(input_fname, output_basename, batch_size):
# We can store multiple datasets inside a single HDF5 file.
# X and Y are stored separately so that each can use the best chunk size for data loading.
fname1 = f'{output_basename}1.h5'
fname2 = f'{output_basename}2.h5'
save2hdf({'Y': df1.iloc[:, :1], 'X': df1.iloc[:, 1:]}, fname1, batch_size)
save2hdf({'Y': df2.iloc[:, :1], 'X': df2.iloc[:, 1:]}, fname2, batch_size)
fname1 = f"{output_basename}1.h5"
fname2 = f"{output_basename}2.h5"
save2hdf({"Y": df1.iloc[:, :1], "X": df1.iloc[:, 1:]}, fname1, batch_size)
save2hdf({"Y": df2.iloc[:, :1], "X": df2.iloc[:, 1:]}, fname2, batch_size)
return [fname1, fname2]
def main():
batch_size = 64
output_basename = 'regression'
output_basename = "regression"
hdf_files = generate_hdf(
str(Path(__file__).absolute().parents[1] / 'regression' / 'regression.train'),
output_basename,
batch_size
str(Path(__file__).absolute().parents[1] / "regression" / "regression.train"), output_basename, batch_size
)
create_dataset_from_multiple_hdf(hdf_files, batch_size=batch_size)
if __name__ == '__main__':
if __name__ == "__main__":
main()
@@ -24,23 +24,19 @@ import lightgbm as lgb
# single continuous predictor
np.random.seed(0)
N = 1000
X = pd.DataFrame({
'continuous': range(N),
'categorical': np.repeat([0, 1, 2, 3, 4], N / 5)
})
X = pd.DataFrame({"continuous": range(N), "categorical": np.repeat([0, 1, 2, 3, 4], N / 5)})
CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2]
LINEAR_TERM = np.array([
-0.5 + 0.01 * X['continuous'][k]
+ CATEGORICAL_EFFECTS[X['categorical'][k]] for k in range(X.shape[0])
]) + np.random.normal(0, 1, X.shape[0])
LINEAR_TERM = np.array(
[-0.5 + 0.01 * X["continuous"][k] + CATEGORICAL_EFFECTS[X["categorical"][k]] for k in range(X.shape[0])]
) + np.random.normal(0, 1, X.shape[0])
TRUE_PROB = expit(LINEAR_TERM)
Y = np.random.binomial(1, TRUE_PROB, size=N)
DATA = {
'X': X,
'probability_labels': TRUE_PROB,
'binary_labels': Y,
'lgb_with_binary_labels': lgb.Dataset(X, Y),
'lgb_with_probability_labels': lgb.Dataset(X, TRUE_PROB),
"X": X,
"probability_labels": TRUE_PROB,
"binary_labels": Y,
"lgb_with_binary_labels": lgb.Dataset(X, Y),
"lgb_with_probability_labels": lgb.Dataset(X, TRUE_PROB),
}
@@ -72,34 +68,25 @@ def experiment(objective, label_type, data):
np.random.seed(0)
nrounds = 5
lgb_data = data[f"lgb_with_{label_type}_labels"]
params = {
'objective': objective,
'feature_fraction': 1,
'bagging_fraction': 1,
'verbose': -1
}
params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1}
time_zero = time.time()
gbm = lgb.train(params, lgb_data, num_boost_round=nrounds)
y_fitted = gbm.predict(data['X'])
y_fitted = gbm.predict(data["X"])
y_true = data[f"{label_type}_labels"]
duration = time.time() - time_zero
return {
'time': duration,
'correlation': np.corrcoef(y_fitted, y_true)[0, 1],
'logloss': log_loss(y_fitted, y_true)
}
return {"time": duration, "correlation": np.corrcoef(y_fitted, y_true)[0, 1], "logloss": log_loss(y_fitted, y_true)}
#################
# Observe the behavior of `binary` and `xentropy` objectives
print('Performance of `binary` objective with binary labels:')
print(experiment('binary', label_type='binary', data=DATA))
print("Performance of `binary` objective with binary labels:")
print(experiment("binary", label_type="binary", data=DATA))
print('Performance of `xentropy` objective with binary labels:')
print(experiment('xentropy', label_type='binary', data=DATA))
print("Performance of `xentropy` objective with binary labels:")
print(experiment("xentropy", label_type="binary", data=DATA))
print('Performance of `xentropy` objective with probability labels:')
print(experiment('xentropy', label_type='probability', data=DATA))
print("Performance of `xentropy` objective with probability labels:")
print(experiment("xentropy", label_type="probability", data=DATA))
# Trying this throws an error on non-binary values of y:
# experiment('binary', label_type='probability', data=DATA)
@@ -109,9 +96,7 @@ print(experiment('xentropy', label_type='probability', data=DATA))
# there are reasons to suspect that `binary` should run faster when the
# label is an integer instead of a float
K = 10
A = [experiment('binary', label_type='binary', data=DATA)['time']
for k in range(K)]
B = [experiment('xentropy', label_type='binary', data=DATA)['time']
for k in range(K)]
A = [experiment("binary", label_type="binary", data=DATA)["time"] for k in range(K)]
B = [experiment("xentropy", label_type="binary", data=DATA)["time"] for k in range(K)]
print(f"Best `binary` time: {min(A)}")
print(f"Best `xentropy` time: {min(B)}")
This source diff could not be displayed because it is too large.
@@ -8,13 +8,13 @@ import lightgbm as lgb
if lgb.compat.MATPLOTLIB_INSTALLED:
import matplotlib.pyplot as plt
else:
raise ImportError('You need to install matplotlib and restart your session for plot_example.py.')
raise ImportError("You need to install matplotlib and restart your session for plot_example.py.")
print('Loading data...')
print("Loading data...")
# load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
y_train = df_train[0]
y_test = df_test[0]
@@ -26,45 +26,38 @@ lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
'num_leaves': 5,
'metric': ('l1', 'l2'),
'verbose': 0
}
params = {"num_leaves": 5, "metric": ("l1", "l2"), "verbose": 0}
evals_result = {} # to record eval results for plotting
print('Starting training...')
print("Starting training...")
# train
gbm = lgb.train(
params,
lgb_train,
num_boost_round=100,
valid_sets=[lgb_train, lgb_test],
feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
feature_name=[f"f{i + 1}" for i in range(X_train.shape[-1])],
categorical_feature=[21],
callbacks=[
lgb.log_evaluation(10),
lgb.record_evaluation(evals_result)
]
callbacks=[lgb.log_evaluation(10), lgb.record_evaluation(evals_result)],
)
print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
print("Plotting metrics recorded during training...")
ax = lgb.plot_metric(evals_result, metric="l1")
plt.show()
print('Plotting feature importances...')
print("Plotting feature importances...")
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()
print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
print("Plotting split value histogram...")
ax = lgb.plot_split_value_histogram(gbm, feature="f26", bins="auto")
plt.show()
print('Plotting 54th tree...') # one tree uses a categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain'])
print("Plotting 54th tree...") # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=["split_gain"])
plt.show()
print('Plotting 54th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54')
print("Plotting 54th tree with graphviz...")
graph = lgb.create_tree_digraph(gbm, tree_index=53, name="Tree54")
graph.render(view=True)
@@ -6,11 +6,11 @@ from sklearn.metrics import mean_squared_error
import lightgbm as lgb
print('Loading data...')
print("Loading data...")
# load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
y_train = df_train[0]
y_test = df_test[0]
@@ -23,32 +23,30 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': {'l2', 'l1'},
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
"boosting_type": "gbdt",
"objective": "regression",
"metric": {"l2", "l1"},
"num_leaves": 31,
"learning_rate": 0.05,
"feature_fraction": 0.9,
"bagging_fraction": 0.8,
"bagging_freq": 5,
"verbose": 0,
}
print('Starting training...')
print("Starting training...")
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=20,
valid_sets=lgb_eval,
callbacks=[lgb.early_stopping(stopping_rounds=5)])
gbm = lgb.train(
params, lgb_train, num_boost_round=20, valid_sets=lgb_eval, callbacks=[lgb.early_stopping(stopping_rounds=5)]
)
print('Saving model...')
print("Saving model...")
# save model to file
gbm.save_model('model.txt')
gbm.save_model("model.txt")
print('Starting predicting...')
print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')
print(f"The RMSE of prediction is: {rmse_test}")
@@ -8,85 +8,71 @@ from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
print('Loading data...')
print("Loading data...")
# load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
print('Starting training...')
print("Starting training...")
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
learning_rate=0.05,
n_estimators=20)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric='l1',
callbacks=[lgb.early_stopping(5)])
print('Starting predicting...')
gbm = lgb.LGBMRegressor(num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="l1", callbacks=[lgb.early_stopping(5)])
print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')
print(f"The RMSE of prediction is: {rmse_test}")
# feature importances
print(f'Feature importances: {list(gbm.feature_importances_)}')
print(f"Feature importances: {list(gbm.feature_importances_)}")
# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
return "RMSLE", np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
print('Starting training with custom eval function...')
print("Starting training with custom eval function...")
# train
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric=rmsle,
callbacks=[lgb.early_stopping(5)])
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=rmsle, callbacks=[lgb.early_stopping(5)])
# another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE)
def rae(y_true, y_pred):
return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False
return "RAE", np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False
print('Starting training with multiple custom eval functions...')
print("Starting training with multiple custom eval functions...")
# train
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric=[rmsle, rae],
callbacks=[lgb.early_stopping(5)])
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=[rmsle, rae], callbacks=[lgb.early_stopping(5)])
print('Starting predicting...')
print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmsle_test = rmsle(y_test, y_pred)[1]
rae_test = rae(y_test, y_pred)[1]
print(f'The RMSLE of prediction is: {rmsle_test}')
print(f'The RAE of prediction is: {rae_test}')
print(f"The RMSLE of prediction is: {rmsle_test}")
print(f"The RAE of prediction is: {rae_test}")
# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
'learning_rate': [0.01, 0.1, 1],
'n_estimators': [20, 40]
}
param_grid = {"learning_rate": [0.01, 0.1, 1], "n_estimators": [20, 40]}
gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)
print(f'Best parameters found by grid search are: {gbm.best_params_}')
print(f"Best parameters found by grid search are: {gbm.best_params_}")
@@ -18,9 +18,23 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional,
import numpy as np
import scipy.sparse
from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat,
dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table,
pd_CategoricalDtype, pd_DataFrame, pd_Series)
from .compat import (
PANDAS_INSTALLED,
PYARROW_INSTALLED,
arrow_cffi,
arrow_is_floating,
arrow_is_integer,
concat,
dt_DataTable,
pa_Array,
pa_chunked_array,
pa_ChunkedArray,
pa_compute,
pa_Table,
pd_CategoricalDtype,
pd_DataFrame,
pd_Series,
)
from .libpath import find_lib_path
if TYPE_CHECKING:
@@ -5,8 +5,14 @@ from dataclasses import dataclass
from functools import partial
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType,
_LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning)
from .basic import (
Booster,
_ConfigAliases,
_LGBM_BoosterEvalMethodResultType,
_LGBM_BoosterEvalMethodResultWithStandardDeviationType,
_log_info,
_log_warning,
)
if TYPE_CHECKING:
from .engine import CVBooster
@@ -19,12 +19,36 @@ import numpy as np
import scipy.sparse as ss
from .basic import LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning
from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, Future, LGBMNotFittedError, concat,
dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series,
default_client, delayed, pd_DataFrame, pd_Series, wait)
from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomObjectiveFunction,
_LGBM_ScikitEvalMetricType, _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit,
_lgbmmodel_doc_predict)
from .compat import (
DASK_INSTALLED,
PANDAS_INSTALLED,
SKLEARN_INSTALLED,
Client,
Future,
LGBMNotFittedError,
concat,
dask_Array,
dask_array_from_delayed,
dask_bag_from_delayed,
dask_DataFrame,
dask_Series,
default_client,
delayed,
pd_DataFrame,
pd_Series,
wait,
)
from .sklearn import (
LGBMClassifier,
LGBMModel,
LGBMRanker,
LGBMRegressor,
_LGBM_ScikitCustomObjectiveFunction,
_LGBM_ScikitEvalMetricType,
_lgbmmodel_doc_custom_eval_note,
_lgbmmodel_doc_fit,
_lgbmmodel_doc_predict,
)
__all__ = [
'DaskLGBMClassifier',
@@ -10,10 +10,21 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from . import callback
from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor,
_LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
_LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType,
_LGBM_FeatureNameConfiguration, _log_warning)
from .basic import (
Booster,
Dataset,
LightGBMError,
_choose_param_value,
_ConfigAliases,
_InnerPredictor,
_LGBM_BoosterEvalMethodResultType,
_LGBM_BoosterEvalMethodResultWithStandardDeviationType,
_LGBM_CategoricalFeatureConfiguration,
_LGBM_CustomObjectiveFunction,
_LGBM_EvalFunctionResultType,
_LGBM_FeatureNameConfiguration,
_log_warning,
)
from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold
__all__ = [
@@ -8,14 +8,41 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
import scipy.sparse
from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
_LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
_LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning)
from .basic import (
Booster,
Dataset,
LightGBMError,
_choose_param_value,
_ConfigAliases,
_LGBM_BoosterBestScoreType,
_LGBM_CategoricalFeatureConfiguration,
_LGBM_EvalFunctionResultType,
_LGBM_FeatureNameConfiguration,
_LGBM_GroupType,
_LGBM_InitScoreType,
_LGBM_LabelType,
_LGBM_WeightType,
_log_warning,
)
from .callback import _EvalResultDict, record_evaluation
from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
_LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
_LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
dt_DataTable, np_random_Generator, pd_DataFrame)
from .compat import (
SKLEARN_INSTALLED,
LGBMNotFittedError,
_LGBMAssertAllFinite,
_LGBMCheckArray,
_LGBMCheckClassificationTargets,
_LGBMCheckSampleWeight,
_LGBMCheckXY,
_LGBMClassifierBase,
_LGBMComputeSampleWeight,
_LGBMCpuCount,
_LGBMLabelEncoder,
_LGBMModelBase,
_LGBMRegressorBase,
dt_DataTable,
np_random_Generator,
pd_DataFrame,
)
from .engine import train
__all__ = [
@@ -81,10 +81,14 @@ minimum-version = "0.4.4"
# end:build-system
[tool.isort]
include_trailing_comma = true
line_length = 120
# "vertical hanging indent", to match what ruff-format does
# ref: https://pycqa.github.io/isort/docs/configuration/multi_line_output_modes.html#3-vertical-hanging-indent
multi_line_output = 3
skip_glob = [
"*/external_libs/*",
"*/lightgbm-python/*"
"*/lightgbm-python/*",
]
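For reference, `multi_line_output = 3` is isort's "vertical hanging indent" mode, which wraps long imports the same way ruff-format does, e.g.:

from .compat import (
    PANDAS_INSTALLED,
    pd_DataFrame,
)

matching the reformatted imports elsewhere in this diff.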
[tool.mypy]
@@ -108,14 +112,13 @@ docstring-code-format = false
exclude = [
"build/*.py",
"compile/*.py",
"examples/*.py",
"external_libs/*.py",
"lightgbm-python/*.py",
"python-package/*.py",
"tests/*.py"
]
indent-style = "space"
quote-style = "double"
skip-magic-trailing-comma = false
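Per the commit title, this hunk trims the formatter's exclude list so that the examples and tests directories now fall under ruff-format. Assuming ruff is installed, the check CI runs can be reproduced locally with something like:

    ruff format --config python-package/pyproject.toml .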
[tool.ruff.lint]
ignore = [
@@ -10,7 +10,7 @@ try:
from lightgbm.basic import _LIB as LIB
except ModuleNotFoundError:
print("Could not import lightgbm Python package, looking for lib_lightgbm at the repo root")
if system() in ('Windows', 'Microsoft'):
if system() in ("Windows", "Microsoft"):
lib_file = Path(__file__).absolute().parents[2] / "Release" / "lib_lightgbm.dll"
else:
lib_file = Path(__file__).absolute().parents[2] / "lib_lightgbm.so"
@@ -25,7 +25,7 @@ dtype_int64 = 3
def c_str(string):
return ctypes.c_char_p(string.encode('utf-8'))
return ctypes.c_char_p(string.encode("utf-8"))
def load_from_file(filename, reference):
@@ -33,17 +33,13 @@ def load_from_file(filename, reference):
if reference is not None:
ref = reference
handle = ctypes.c_void_p()
LIB.LGBM_DatasetCreateFromFile(
c_str(str(filename)),
c_str('max_bin=15'),
ref,
ctypes.byref(handle))
LIB.LGBM_DatasetCreateFromFile(c_str(str(filename)), c_str("max_bin=15"), ref, ctypes.byref(handle))
print(LIB.LGBM_GetLastError())
num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
print(f'#data: {num_data.value} #feature: {num_feature.value}')
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle
@@ -69,20 +65,22 @@ def load_from_csr(filename, reference):
ctypes.c_int64(len(csr.indptr)),
ctypes.c_int64(len(csr.data)),
ctypes.c_int64(csr.shape[1]),
c_str('max_bin=15'),
c_str("max_bin=15"),
ref,
ctypes.byref(handle))
ctypes.byref(handle),
)
num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(
handle,
c_str('label'),
c_str("label"),
label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.c_int(len(label)),
ctypes.c_int(dtype_float32))
print(f'#data: {num_data.value} #feature: {num_feature.value}')
ctypes.c_int(dtype_float32),
)
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle
@@ -104,20 +102,22 @@ def load_from_csc(filename, reference):
ctypes.c_int64(len(csc.indptr)),
ctypes.c_int64(len(csc.data)),
ctypes.c_int64(csc.shape[0]),
c_str('max_bin=15'),
c_str("max_bin=15"),
ref,
ctypes.byref(handle))
ctypes.byref(handle),
)
num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(
handle,
c_str('label'),
c_str("label"),
label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.c_int(len(label)),
ctypes.c_int(dtype_float32))
print(f'#data: {num_data.value} #feature: {num_feature.value}')
ctypes.c_int(dtype_float32),
)
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle
@@ -137,20 +137,22 @@ def load_from_mat(filename, reference):
ctypes.c_int32(mat.shape[0]),
ctypes.c_int32(mat.shape[1]),
ctypes.c_int(1),
c_str('max_bin=15'),
c_str("max_bin=15"),
ref,
ctypes.byref(handle))
ctypes.byref(handle),
)
num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(
handle,
c_str('label'),
c_str("label"),
label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.c_int(len(label)),
ctypes.c_int(dtype_float32))
print(f'#data: {num_data.value} #feature: {num_feature.value}')
ctypes.c_int(dtype_float32),
)
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle
@@ -159,29 +161,26 @@ def free_dataset(handle):
def test_dataset():
binary_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification'
train = load_from_file(binary_example_dir / 'binary.train', None)
test = load_from_mat(binary_example_dir / 'binary.test', train)
binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
train = load_from_file(binary_example_dir / "binary.train", None)
test = load_from_mat(binary_example_dir / "binary.test", train)
free_dataset(test)
test = load_from_csr(binary_example_dir / 'binary.test', train)
test = load_from_csr(binary_example_dir / "binary.test", train)
free_dataset(test)
test = load_from_csc(binary_example_dir / 'binary.test', train)
test = load_from_csc(binary_example_dir / "binary.test", train)
free_dataset(test)
save_to_binary(train, 'train.binary.bin')
save_to_binary(train, "train.binary.bin")
free_dataset(train)
train = load_from_file('train.binary.bin', None)
train = load_from_file("train.binary.bin", None)
free_dataset(train)
def test_booster():
binary_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification'
train = load_from_mat(binary_example_dir / 'binary.train', None)
test = load_from_mat(binary_example_dir / 'binary.test', train)
binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
train = load_from_mat(binary_example_dir / "binary.train", None)
test = load_from_mat(binary_example_dir / "binary.test", train)
booster = ctypes.c_void_p()
LIB.LGBM_BoosterCreate(
train,
c_str("app=binary metric=auc num_leaves=31 verbose=0"),
ctypes.byref(booster))
LIB.LGBM_BoosterCreate(train, c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
LIB.LGBM_BoosterAddValidData(booster, test)
is_finished = ctypes.c_int(0)
for i in range(1, 51):
@@ -189,28 +188,18 @@ def test_booster():
result = np.array([0.0], dtype=np.float64)
out_len = ctypes.c_int(0)
LIB.LGBM_BoosterGetEval(
booster,
ctypes.c_int(0),
ctypes.byref(out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
booster, ctypes.c_int(0), ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
)
if i % 10 == 0:
print(f'{i} iteration test AUC {result[0]:.6f}')
LIB.LGBM_BoosterSaveModel(
booster,
ctypes.c_int(0),
ctypes.c_int(-1),
ctypes.c_int(0),
c_str('model.txt'))
print(f"{i} iteration test AUC {result[0]:.6f}")
LIB.LGBM_BoosterSaveModel(booster, ctypes.c_int(0), ctypes.c_int(-1), ctypes.c_int(0), c_str("model.txt"))
LIB.LGBM_BoosterFree(booster)
free_dataset(train)
free_dataset(test)
booster2 = ctypes.c_void_p()
num_total_model = ctypes.c_int(0)
LIB.LGBM_BoosterCreateFromModelfile(
c_str('model.txt'),
ctypes.byref(num_total_model),
ctypes.byref(booster2))
data = np.loadtxt(str(binary_example_dir / 'binary.test'), dtype=np.float64)
LIB.LGBM_BoosterCreateFromModelfile(c_str("model.txt"), ctypes.byref(num_total_model), ctypes.byref(booster2))
data = np.loadtxt(str(binary_example_dir / "binary.test"), dtype=np.float64)
mat = data[:, 1:]
preb = np.empty(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_int64(0)
@@ -225,58 +214,51 @@ def test_booster():
ctypes.c_int(1),
ctypes.c_int(0),
ctypes.c_int(25),
c_str(''),
c_str(""),
ctypes.byref(num_preb),
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
)
LIB.LGBM_BoosterPredictForFile(
booster2,
c_str(str(binary_example_dir / 'binary.test')),
c_str(str(binary_example_dir / "binary.test")),
ctypes.c_int(0),
ctypes.c_int(0),
ctypes.c_int(0),
ctypes.c_int(25),
c_str(''),
c_str('preb.txt'))
c_str(""),
c_str("preb.txt"),
)
LIB.LGBM_BoosterPredictForFile(
booster2,
c_str(str(binary_example_dir / 'binary.test')),
c_str(str(binary_example_dir / "binary.test")),
ctypes.c_int(0),
ctypes.c_int(0),
ctypes.c_int(10),
ctypes.c_int(25),
c_str(''),
c_str('preb.txt'))
c_str(""),
c_str("preb.txt"),
)
LIB.LGBM_BoosterFree(booster2)
def test_max_thread_control():
# at initialization, should be -1
num_threads = ctypes.c_int(0)
ret = LIB.LGBM_GetMaxThreads(
ctypes.byref(num_threads)
)
ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
assert ret == 0
assert num_threads.value == -1
# updating that value through the C API should work
ret = LIB.LGBM_SetMaxThreads(
ctypes.c_int(6)
)
ret = LIB.LGBM_SetMaxThreads(ctypes.c_int(6))
assert ret == 0
ret = LIB.LGBM_GetMaxThreads(
ctypes.byref(num_threads)
)
ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
assert ret == 0
assert num_threads.value == 6
# resetting to any negative number should set it to -1
ret = LIB.LGBM_SetMaxThreads(
ctypes.c_int(-123)
)
ret = LIB.LGBM_SetMaxThreads(ctypes.c_int(-123))
assert ret == 0
ret = LIB.LGBM_GetMaxThreads(
ctypes.byref(num_threads)
)
ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
assert ret == 0
assert num_threads.value == -1
@@ -3,5 +3,5 @@ from pathlib import Path
import numpy as np
preds = [np.loadtxt(str(name)) for name in Path(__file__).absolute().parent.glob('*.pred')]
preds = [np.loadtxt(str(name)) for name in Path(__file__).absolute().parent.glob("*.pred")]
np.testing.assert_allclose(preds[0], preds[1])
@@ -14,16 +14,16 @@ from sklearn.metrics import accuracy_score
TESTS_DIR = Path(__file__).absolute().parent
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def executable(pytestconfig) -> str:
"""Returns the path to the lightgbm executable."""
return pytestconfig.getoption('execfile')
return pytestconfig.getoption("execfile")
def _find_random_open_port() -> int:
"""Find a random open port on localhost."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0))
s.bind(("", 0))
port = s.getsockname()[1]
return port # noqa: RET504
@@ -34,7 +34,7 @@ def _generate_n_ports(n: int) -> Generator[int, None, None]:
def _write_dict(d: Dict, file: io.TextIOWrapper) -> None:
for k, v in d.items():
file.write(f'{k} = {v}\n')
file.write(f"{k} = {v}\n")
def create_data(task: str, n_samples: int = 1_000) -> np.ndarray:
@@ -42,10 +42,10 @@ def create_data(task: str, n_samples: int = 1_000) -> np.ndarray:
The data is returned as a numpy array with the label as the first column.
"""
if task == 'binary-classification':
if task == "binary-classification":
centers = [[-4, -4], [4, 4]]
X, y = make_blobs(n_samples, centers=centers, random_state=42)
elif task == 'regression':
elif task == "regression":
X, y = make_regression(n_samples, n_features=4, n_informative=2, random_state=42)
return np.hstack([y.reshape(-1, 1), X])
@@ -54,22 +54,22 @@ class DistributedMockup:
"""Simulate distributed training."""
default_train_config = {
'task': 'train',
'pre_partition': True,
'machine_list_file': TESTS_DIR / 'mlist.txt',
'tree_learner': 'data',
'force_row_wise': True,
'verbose': 0,
'num_boost_round': 20,
'num_leaves': 15,
'num_threads': 2,
"task": "train",
"pre_partition": True,
"machine_list_file": TESTS_DIR / "mlist.txt",
"tree_learner": "data",
"force_row_wise": True,
"verbose": 0,
"num_boost_round": 20,
"num_leaves": 15,
"num_threads": 2,
}
default_predict_config = {
'task': 'predict',
'data': TESTS_DIR / 'train.txt',
'input_model': TESTS_DIR / 'model0.txt',
'output_result': TESTS_DIR / 'predictions.txt',
"task": "predict",
"data": TESTS_DIR / "train.txt",
"input_model": TESTS_DIR / "model0.txt",
"output_result": TESTS_DIR / "predictions.txt",
}
def __init__(self, executable: str):
@@ -77,8 +77,8 @@ class DistributedMockup:
def worker_train(self, i: int) -> subprocess.CompletedProcess:
"""Start the training process on the `i`-th worker."""
config_path = TESTS_DIR / f'train{i}.conf'
cmd = [self.executable, f'config={config_path}']
config_path = TESTS_DIR / f"train{i}.conf"
cmd = [self.executable, f"config={config_path}"]
return subprocess.run(cmd)
def _set_ports(self) -> None:
......@@ -92,18 +92,18 @@ class DistributedMockup:
ports.update(candidates)
i += 1
if i == max_tries:
raise RuntimeError('Unable to find non-colliding ports.')
raise RuntimeError("Unable to find non-colliding ports.")
self.listen_ports = list(ports)
with open(TESTS_DIR / 'mlist.txt', 'wt') as f:
with open(TESTS_DIR / "mlist.txt", "wt") as f:
for port in self.listen_ports:
f.write(f'127.0.0.1 {port}\n')
f.write(f"127.0.0.1 {port}\n")
def _write_data(self, partitions: List[np.ndarray]) -> None:
"""Write all training data as train.txt and each training partition as train{i}.txt."""
all_data = np.vstack(partitions)
np.savetxt(str(TESTS_DIR / 'train.txt'), all_data, delimiter=',')
np.savetxt(str(TESTS_DIR / "train.txt"), all_data, delimiter=",")
for i, partition in enumerate(partitions):
np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',')
np.savetxt(str(TESTS_DIR / f"train{i}.txt"), partition, delimiter=",")
def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None:
"""Run the distributed training process on a single machine.
@@ -118,7 +118,7 @@ class DistributedMockup:
"""
self.train_config = copy.deepcopy(self.default_train_config)
self.train_config.update(train_config)
self.n_workers = self.train_config['num_machines']
self.n_workers = self.train_config["num_machines"]
self._set_ports()
self._write_data(partitions)
self.label_ = np.hstack([partition[:, 0] for partition in partitions])
@@ -131,7 +131,7 @@ class DistributedMockup:
results = [f.result() for f in futures]
for result in results:
if result.returncode != 0:
raise RuntimeError('Error in training')
raise RuntimeError("Error in training")
def predict(self, predict_config: Dict[str, Any]) -> np.ndarray:
"""Compute the predictions using the model created in the fit step.
@@ -141,14 +141,14 @@
"""
self.predict_config = copy.deepcopy(self.default_predict_config)
self.predict_config.update(predict_config)
config_path = TESTS_DIR / 'predict.conf'
with open(config_path, 'wt') as file:
config_path = TESTS_DIR / "predict.conf"
with open(config_path, "wt") as file:
_write_dict(self.predict_config, file)
cmd = [self.executable, f'config={config_path}']
cmd = [self.executable, f"config={config_path}"]
result = subprocess.run(cmd)
if result.returncode != 0:
raise RuntimeError('Error in prediction')
return np.loadtxt(str(TESTS_DIR / 'predictions.txt'))
raise RuntimeError("Error in prediction")
return np.loadtxt(str(TESTS_DIR / "predictions.txt"))
def write_train_config(self, i: int) -> None:
"""Create a file train{i}.conf with the required configuration to train.
@@ -156,41 +156,41 @@
Each worker gets a different port and piece of the data; the rest are the
model parameters contained in `self.train_config`.
"""
with open(TESTS_DIR / f'train{i}.conf', 'wt') as file:
output_model = TESTS_DIR / f'model{i}.txt'
data = TESTS_DIR / f'train{i}.txt'
file.write(f'output_model = {output_model}\n')
file.write(f'local_listen_port = {self.listen_ports[i]}\n')
file.write(f'data = {data}\n')
with open(TESTS_DIR / f"train{i}.conf", "wt") as file:
output_model = TESTS_DIR / f"model{i}.txt"
data = TESTS_DIR / f"train{i}.txt"
file.write(f"output_model = {output_model}\n")
file.write(f"local_listen_port = {self.listen_ports[i]}\n")
file.write(f"data = {data}\n")
_write_dict(self.train_config, file)
def test_classifier(executable):
"""Test the classification task."""
num_machines = 2
data = create_data(task='binary-classification')
data = create_data(task="binary-classification")
partitions = np.array_split(data, num_machines)
train_params = {
'objective': 'binary',
'num_machines': num_machines,
"objective": "binary",
"num_machines": num_machines,
}
clf = DistributedMockup(executable)
clf.fit(partitions, train_params)
y_probas = clf.predict(predict_config={})
y_pred = y_probas > 0.5
assert accuracy_score(clf.label_, y_pred) == 1.
assert accuracy_score(clf.label_, y_pred) == 1.0
def test_regressor(executable):
"""Test the regression task."""
num_machines = 2
data = create_data(task='regression')
data = create_data(task="regression")
partitions = np.array_split(data, num_machines)
train_params = {
'objective': 'regression',
'num_machines': num_machines,
"objective": "regression",
"num_machines": num_machines,
}
reg = DistributedMockup(executable)
reg.fit(partitions, train_params)
y_pred = reg.predict(predict_config={})
np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.)
np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.0)
from pathlib import Path
default_exec_file = Path(__file__).absolute().parents[2] / 'lightgbm'
default_exec_file = Path(__file__).absolute().parents[2] / "lightgbm"
def pytest_addoption(parser):
parser.addoption('--execfile', action='store', default=str(default_exec_file))
parser.addoption("--execfile", action="store", default=str(default_exec_file))
@@ -71,9 +71,7 @@ def generate_random_arrow_table(
values: Optional[np.ndarray] = None,
) -> pa.Table:
columns = [
generate_random_arrow_array(
num_datapoints, seed + i, generate_nulls=generate_nulls, values=values
)
generate_random_arrow_array(num_datapoints, seed + i, generate_nulls=generate_nulls, values=values)
for i in range(num_columns)
]
names = [f"col_{i}" for i in range(num_columns)]
@@ -156,9 +154,7 @@ def test_dataset_construct_fields_fuzzy():
arrow_weights = generate_random_arrow_array(1000, 42, generate_nulls=False)
arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32())
arrow_dataset = lgb.Dataset(
arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups
)
arrow_dataset = lgb.Dataset(arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups)
arrow_dataset.construct()
pandas_dataset = lgb.Dataset(
@@ -171,9 +167,7 @@
# Check for equality
for field in ("label", "weight", "group"):
np_assert_array_equal(
arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True
)
np_assert_array_equal(arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True)
np_assert_array_equal(arrow_dataset.get_label(), pandas_dataset.get_label(), strict=True)
np_assert_array_equal(arrow_dataset.get_weight(), pandas_dataset.get_weight(), strict=True)
@@ -269,9 +263,7 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type):
],
)
@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES)
def test_dataset_construct_init_scores_array(
array_type: Any, init_score_data: Any, arrow_type: Any
):
def test_dataset_construct_init_scores_array(array_type: Any, init_score_data: Any, arrow_type: Any):
data = generate_dummy_arrow_table()
init_scores = array_type(init_score_data, type=arrow_type)
dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params())
@@ -320,9 +312,7 @@ def assert_equal_predict_arrow_pandas(booster: lgb.Booster, data: pa.Table):
np_assert_array_equal(p_pred_contrib_arrow, p_pred_contrib_pandas, strict=True)
p_first_iter_arrow = booster.predict(data, start_iteration=0, num_iteration=1, raw_score=True)
p_first_iter_pandas = booster.predict(
data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True
)
p_first_iter_pandas = booster.predict(data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True)
np_assert_array_equal(p_first_iter_arrow, p_first_iter_pandas, strict=True)