"git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "1bd3d7e3500480525fb3f4443b48edc4053305f8"
Unverified Commit 1b792e71 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[ci] [python-package] enable ruff-format on tests and examples (#6317)

parent b60068c8
...@@ -7,6 +7,12 @@ exclude: | ...@@ -7,6 +7,12 @@ exclude: |
)$ )$
repos: repos:
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
args: ["--settings-path", "python-package/pyproject.toml"]
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version. # Ruff version.
rev: v0.2.1 rev: v0.2.1
...@@ -14,12 +20,8 @@ repos: ...@@ -14,12 +20,8 @@ repos:
# Run the linter. # Run the linter.
- id: ruff - id: ruff
args: ["--config", "python-package/pyproject.toml"] args: ["--config", "python-package/pyproject.toml"]
types_or: [python, jupyter]
# Run the formatter. # Run the formatter.
- id: ruff-format - id: ruff-format
args: ["--config", "python-package/pyproject.toml"] args: ["--config", "python-package/pyproject.toml"]
- repo: https://github.com/pycqa/isort types_or: [python, jupyter]
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
args: ["--settings-path", "python-package/pyproject.toml"]
...@@ -10,13 +10,13 @@ from sklearn.metrics import roc_auc_score ...@@ -10,13 +10,13 @@ from sklearn.metrics import roc_auc_score
import lightgbm as lgb import lightgbm as lgb
print('Loading data...') print("Loading data...")
# load or create your dataset # load or create your dataset
binary_example_dir = Path(__file__).absolute().parents[1] / 'binary_classification' binary_example_dir = Path(__file__).absolute().parents[1] / "binary_classification"
df_train = pd.read_csv(str(binary_example_dir / 'binary.train'), header=None, sep='\t') df_train = pd.read_csv(str(binary_example_dir / "binary.train"), header=None, sep="\t")
df_test = pd.read_csv(str(binary_example_dir / 'binary.test'), header=None, sep='\t') df_test = pd.read_csv(str(binary_example_dir / "binary.test"), header=None, sep="\t")
W_train = pd.read_csv(str(binary_example_dir / 'binary.train.weight'), header=None)[0] W_train = pd.read_csv(str(binary_example_dir / "binary.train.weight"), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / 'binary.test.weight'), header=None)[0] W_test = pd.read_csv(str(binary_example_dir / "binary.test.weight"), header=None)[0]
y_train = df_train[0] y_train = df_train[0]
y_test = df_test[0] y_test = df_test[0]
...@@ -27,72 +27,72 @@ num_train, num_feature = X_train.shape ...@@ -27,72 +27,72 @@ num_train, num_feature = X_train.shape
# create dataset for lightgbm # create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False # if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train, lgb_train = lgb.Dataset(X_train, y_train, weight=W_train, free_raw_data=False)
weight=W_train, free_raw_data=False) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, weight=W_test, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
weight=W_test, free_raw_data=False)
# specify your configurations as a dict # specify your configurations as a dict
params = { params = {
'boosting_type': 'gbdt', "boosting_type": "gbdt",
'objective': 'binary', "objective": "binary",
'metric': 'binary_logloss', "metric": "binary_logloss",
'num_leaves': 31, "num_leaves": 31,
'learning_rate': 0.05, "learning_rate": 0.05,
'feature_fraction': 0.9, "feature_fraction": 0.9,
'bagging_fraction': 0.8, "bagging_fraction": 0.8,
'bagging_freq': 5, "bagging_freq": 5,
'verbose': 0 "verbose": 0,
} }
# generate feature names # generate feature names
feature_name = [f'feature_{col}' for col in range(num_feature)] feature_name = [f"feature_{col}" for col in range(num_feature)]
print('Starting training...') print("Starting training...")
# feature_name and categorical_feature # feature_name and categorical_feature
gbm = lgb.train(params, gbm = lgb.train(
lgb_train, params,
num_boost_round=10, lgb_train,
valid_sets=lgb_train, # eval training data num_boost_round=10,
feature_name=feature_name, valid_sets=lgb_train, # eval training data
categorical_feature=[21]) feature_name=feature_name,
categorical_feature=[21],
print('Finished first 10 rounds...') )
print("Finished first 10 rounds...")
# check feature name # check feature name
print(f'7th feature name is: {lgb_train.feature_name[6]}') print(f"7th feature name is: {lgb_train.feature_name[6]}")
print('Saving model...') print("Saving model...")
# save model to file # save model to file
gbm.save_model('model.txt') gbm.save_model("model.txt")
print('Dumping model to JSON...') print("Dumping model to JSON...")
# dump model to JSON (and save to file) # dump model to JSON (and save to file)
model_json = gbm.dump_model() model_json = gbm.dump_model()
with open('model.json', 'w+') as f: with open("model.json", "w+") as f:
json.dump(model_json, f, indent=4) json.dump(model_json, f, indent=4)
# feature names # feature names
print(f'Feature names: {gbm.feature_name()}') print(f"Feature names: {gbm.feature_name()}")
# feature importances # feature importances
print(f'Feature importances: {list(gbm.feature_importance())}') print(f"Feature importances: {list(gbm.feature_importance())}")
print('Loading model to predict...') print("Loading model to predict...")
# load model to predict # load model to predict
bst = lgb.Booster(model_file='model.txt') bst = lgb.Booster(model_file="model.txt")
# can only predict with the best iteration (or the saving iteration) # can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test) y_pred = bst.predict(X_test)
# eval with loaded model # eval with loaded model
auc_loaded_model = roc_auc_score(y_test, y_pred) auc_loaded_model = roc_auc_score(y_test, y_pred)
print(f"The ROC AUC of loaded model's prediction is: {auc_loaded_model}") print(f"The ROC AUC of loaded model's prediction is: {auc_loaded_model}")
print('Dumping and loading model with pickle...') print("Dumping and loading model with pickle...")
# dump model with pickle # dump model with pickle
with open('model.pkl', 'wb') as fout: with open("model.pkl", "wb") as fout:
pickle.dump(gbm, fout) pickle.dump(gbm, fout)
# load model with pickle to predict # load model with pickle to predict
with open('model.pkl', 'rb') as fin: with open("model.pkl", "rb") as fin:
pkl_bst = pickle.load(fin) pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way # can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7) y_pred = pkl_bst.predict(X_test, num_iteration=7)
...@@ -104,36 +104,36 @@ print(f"The ROC AUC of pickled model's prediction is: {auc_pickled_model}") ...@@ -104,36 +104,36 @@ print(f"The ROC AUC of pickled model's prediction is: {auc_pickled_model}")
# init_model accepts: # init_model accepts:
# 1. model file name # 1. model file name
# 2. Booster() # 2. Booster()
gbm = lgb.train(params, gbm = lgb.train(params, lgb_train, num_boost_round=10, init_model="model.txt", valid_sets=lgb_eval)
lgb_train,
num_boost_round=10,
init_model='model.txt',
valid_sets=lgb_eval)
print('Finished 10 - 20 rounds with model file...') print("Finished 10 - 20 rounds with model file...")
# decay learning rates # decay learning rates
# reset_parameter callback accepts: # reset_parameter callback accepts:
# 1. list with length = num_boost_round # 1. list with length = num_boost_round
# 2. function(curr_iter) # 2. function(curr_iter)
gbm = lgb.train(params, gbm = lgb.train(
lgb_train, params,
num_boost_round=10, lgb_train,
init_model=gbm, num_boost_round=10,
valid_sets=lgb_eval, init_model=gbm,
callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99 ** iter))]) valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99**iter))],
)
print('Finished 20 - 30 rounds with decay learning rates...') print("Finished 20 - 30 rounds with decay learning rates...")
# change other parameters during training # change other parameters during training
gbm = lgb.train(params, gbm = lgb.train(
lgb_train, params,
num_boost_round=10, lgb_train,
init_model=gbm, num_boost_round=10,
valid_sets=lgb_eval, init_model=gbm,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)]) valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)],
)
print('Finished 30 - 40 rounds with changing bagging_fraction...') print("Finished 30 - 40 rounds with changing bagging_fraction...")
# self-defined objective function # self-defined objective function
...@@ -141,9 +141,9 @@ print('Finished 30 - 40 rounds with changing bagging_fraction...') ...@@ -141,9 +141,9 @@ print('Finished 30 - 40 rounds with changing bagging_fraction...')
# log likelihood loss # log likelihood loss
def loglikelihood(preds, train_data): def loglikelihood(preds, train_data):
labels = train_data.get_label() labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds)) preds = 1.0 / (1.0 + np.exp(-preds))
grad = preds - labels grad = preds - labels
hess = preds * (1. - preds) hess = preds * (1.0 - preds)
return grad, hess return grad, hess
...@@ -156,22 +156,19 @@ def loglikelihood(preds, train_data): ...@@ -156,22 +156,19 @@ def loglikelihood(preds, train_data):
# Keep this in mind when you use the customization # Keep this in mind when you use the customization
def binary_error(preds, train_data): def binary_error(preds, train_data):
labels = train_data.get_label() labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds)) preds = 1.0 / (1.0 + np.exp(-preds))
return 'error', np.mean(labels != (preds > 0.5)), False return "error", np.mean(labels != (preds > 0.5)), False
# Pass custom objective function through params # Pass custom objective function through params
params_custom_obj = copy.deepcopy(params) params_custom_obj = copy.deepcopy(params)
params_custom_obj['objective'] = loglikelihood params_custom_obj["objective"] = loglikelihood
gbm = lgb.train(params_custom_obj, gbm = lgb.train(
lgb_train, params_custom_obj, lgb_train, num_boost_round=10, init_model=gbm, feval=binary_error, valid_sets=lgb_eval
num_boost_round=10, )
init_model=gbm,
feval=binary_error,
valid_sets=lgb_eval)
print('Finished 40 - 50 rounds with self-defined objective function and eval metric...') print("Finished 40 - 50 rounds with self-defined objective function and eval metric...")
# another self-defined eval metric # another self-defined eval metric
...@@ -183,24 +180,26 @@ print('Finished 40 - 50 rounds with self-defined objective function and eval met ...@@ -183,24 +180,26 @@ print('Finished 40 - 50 rounds with self-defined objective function and eval met
# Keep this in mind when you use the customization # Keep this in mind when you use the customization
def accuracy(preds, train_data): def accuracy(preds, train_data):
labels = train_data.get_label() labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds)) preds = 1.0 / (1.0 + np.exp(-preds))
return 'accuracy', np.mean(labels == (preds > 0.5)), True return "accuracy", np.mean(labels == (preds > 0.5)), True
# Pass custom objective function through params # Pass custom objective function through params
params_custom_obj = copy.deepcopy(params) params_custom_obj = copy.deepcopy(params)
params_custom_obj['objective'] = loglikelihood params_custom_obj["objective"] = loglikelihood
gbm = lgb.train(params_custom_obj, gbm = lgb.train(
lgb_train, params_custom_obj,
num_boost_round=10, lgb_train,
init_model=gbm, num_boost_round=10,
feval=[binary_error, accuracy], init_model=gbm,
valid_sets=lgb_eval) feval=[binary_error, accuracy],
valid_sets=lgb_eval,
)
print('Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...') print("Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...")
print('Starting a new training job...') print("Starting a new training job...")
# callback # callback
...@@ -208,17 +207,14 @@ def reset_metrics(): ...@@ -208,17 +207,14 @@ def reset_metrics():
def callback(env): def callback(env):
lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train) lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
if env.iteration - env.begin_iteration == 5: if env.iteration - env.begin_iteration == 5:
print('Add a new valid dataset at iteration 5...') print("Add a new valid dataset at iteration 5...")
env.model.add_valid(lgb_eval_new, 'new_valid') env.model.add_valid(lgb_eval_new, "new_valid")
callback.before_iteration = True callback.before_iteration = True
callback.order = 0 callback.order = 0
return callback return callback
gbm = lgb.train(params, gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_train, callbacks=[reset_metrics()])
lgb_train,
num_boost_round=10,
valid_sets=lgb_train,
callbacks=[reset_metrics()])
print('Finished first 10 rounds with callback function...') print("Finished first 10 rounds with callback function...")
...@@ -10,9 +10,9 @@ import lightgbm as lgb ...@@ -10,9 +10,9 @@ import lightgbm as lgb
if __name__ == "__main__": if __name__ == "__main__":
print("loading data") print("loading data")
rank_example_dir = Path(__file__).absolute().parents[2] / 'lambdarank' rank_example_dir = Path(__file__).absolute().parents[2] / "lambdarank"
X, y = load_svmlight_file(str(rank_example_dir / 'rank.train')) X, y = load_svmlight_file(str(rank_example_dir / "rank.train"))
group = np.loadtxt(str(rank_example_dir / 'rank.train.query')) group = np.loadtxt(str(rank_example_dir / "rank.train.query"))
print("initializing a Dask cluster") print("initializing a Dask cluster")
...@@ -32,25 +32,14 @@ if __name__ == "__main__": ...@@ -32,25 +32,14 @@ if __name__ == "__main__":
# a sparse boundary to partition the data # a sparse boundary to partition the data
X = X.toarray() X = X.toarray()
dX = da.from_array( dX = da.from_array(x=X, chunks=[(rows_in_part1, rows_in_part2), (num_features,)])
x=X,
chunks=[
(rows_in_part1, rows_in_part2),
(num_features,)
]
)
dy = da.from_array( dy = da.from_array(
x=y, x=y,
chunks=[ chunks=[
(rows_in_part1, rows_in_part2), (rows_in_part1, rows_in_part2),
] ],
)
dg = da.from_array(
x=group,
chunks=[
(100, group.size - 100)
]
) )
dg = da.from_array(x=group, chunks=[(100, group.size - 100)])
print("beginning training") print("beginning training")
......
...@@ -34,13 +34,13 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size): ...@@ -34,13 +34,13 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size):
data = [] data = []
ylist = [] ylist = []
for f in input_flist: for f in input_flist:
f = h5py.File(f, 'r') f = h5py.File(f, "r")
data.append(HDFSequence(f['X'], batch_size)) data.append(HDFSequence(f["X"], batch_size))
ylist.append(f['Y'][:]) ylist.append(f["Y"][:])
params = { params = {
'bin_construct_sample_cnt': 200000, "bin_construct_sample_cnt": 200000,
'max_bin': 255, "max_bin": 255,
} }
y = np.concatenate(ylist) y = np.concatenate(ylist)
dataset = lgb.Dataset(data, label=y, params=params) dataset = lgb.Dataset(data, label=y, params=params)
...@@ -51,7 +51,7 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size): ...@@ -51,7 +51,7 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size):
# The reason is that DataFrame column names will be used in Dataset. For a DataFrame with Int64Index # The reason is that DataFrame column names will be used in Dataset. For a DataFrame with Int64Index
# as columns, Dataset will use column names like ["0", "1", "2", ...]. While for numpy array, column names # as columns, Dataset will use column names like ["0", "1", "2", ...]. While for numpy array, column names
# are using the default one assigned in C++ code (dataset_loader.cpp), like ["Column_0", "Column_1", ...]. # are using the default one assigned in C++ code (dataset_loader.cpp), like ["Column_0", "Column_1", ...].
dataset.save_binary('regression.train.from_hdf.bin') dataset.save_binary("regression.train.from_hdf.bin")
def save2hdf(input_data, fname, batch_size): def save2hdf(input_data, fname, batch_size):
...@@ -59,7 +59,7 @@ def save2hdf(input_data, fname, batch_size): ...@@ -59,7 +59,7 @@ def save2hdf(input_data, fname, batch_size):
Please note chunk size settings in the implementation for I/O performance optimization. Please note chunk size settings in the implementation for I/O performance optimization.
""" """
with h5py.File(fname, 'w') as f: with h5py.File(fname, "w") as f:
for name, data in input_data.items(): for name, data in input_data.items():
nrow, ncol = data.shape nrow, ncol = data.shape
if ncol == 1: if ncol == 1:
...@@ -75,12 +75,12 @@ def save2hdf(input_data, fname, batch_size): ...@@ -75,12 +75,12 @@ def save2hdf(input_data, fname, batch_size):
# Also note that the data is stored in row major order to avoid extra copy when passing to # Also note that the data is stored in row major order to avoid extra copy when passing to
# lightgbm Dataset. # lightgbm Dataset.
chunk = (batch_size, ncol) chunk = (batch_size, ncol)
f.create_dataset(name, data=data, chunks=chunk, compression='lzf') f.create_dataset(name, data=data, chunks=chunk, compression="lzf")
def generate_hdf(input_fname, output_basename, batch_size): def generate_hdf(input_fname, output_basename, batch_size):
# Save to 2 HDF5 files for demonstration. # Save to 2 HDF5 files for demonstration.
df = pd.read_csv(input_fname, header=None, sep='\t') df = pd.read_csv(input_fname, header=None, sep="\t")
mid = len(df) // 2 mid = len(df) // 2
df1 = df.iloc[:mid] df1 = df.iloc[:mid]
...@@ -88,25 +88,23 @@ def generate_hdf(input_fname, output_basename, batch_size): ...@@ -88,25 +88,23 @@ def generate_hdf(input_fname, output_basename, batch_size):
# We can store multiple datasets inside a single HDF5 file. # We can store multiple datasets inside a single HDF5 file.
# Separating X and Y for choosing best chunk size for data loading. # Separating X and Y for choosing best chunk size for data loading.
fname1 = f'{output_basename}1.h5' fname1 = f"{output_basename}1.h5"
fname2 = f'{output_basename}2.h5' fname2 = f"{output_basename}2.h5"
save2hdf({'Y': df1.iloc[:, :1], 'X': df1.iloc[:, 1:]}, fname1, batch_size) save2hdf({"Y": df1.iloc[:, :1], "X": df1.iloc[:, 1:]}, fname1, batch_size)
save2hdf({'Y': df2.iloc[:, :1], 'X': df2.iloc[:, 1:]}, fname2, batch_size) save2hdf({"Y": df2.iloc[:, :1], "X": df2.iloc[:, 1:]}, fname2, batch_size)
return [fname1, fname2] return [fname1, fname2]
def main(): def main():
batch_size = 64 batch_size = 64
output_basename = 'regression' output_basename = "regression"
hdf_files = generate_hdf( hdf_files = generate_hdf(
str(Path(__file__).absolute().parents[1] / 'regression' / 'regression.train'), str(Path(__file__).absolute().parents[1] / "regression" / "regression.train"), output_basename, batch_size
output_basename,
batch_size
) )
create_dataset_from_multiple_hdf(hdf_files, batch_size=batch_size) create_dataset_from_multiple_hdf(hdf_files, batch_size=batch_size)
if __name__ == '__main__': if __name__ == "__main__":
main() main()
...@@ -24,23 +24,19 @@ import lightgbm as lgb ...@@ -24,23 +24,19 @@ import lightgbm as lgb
# single continuous predictor # single continuous predictor
np.random.seed(0) np.random.seed(0)
N = 1000 N = 1000
X = pd.DataFrame({ X = pd.DataFrame({"continuous": range(N), "categorical": np.repeat([0, 1, 2, 3, 4], N / 5)})
'continuous': range(N),
'categorical': np.repeat([0, 1, 2, 3, 4], N / 5)
})
CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2] CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2]
LINEAR_TERM = np.array([ LINEAR_TERM = np.array(
-0.5 + 0.01 * X['continuous'][k] [-0.5 + 0.01 * X["continuous"][k] + CATEGORICAL_EFFECTS[X["categorical"][k]] for k in range(X.shape[0])]
+ CATEGORICAL_EFFECTS[X['categorical'][k]] for k in range(X.shape[0]) ) + np.random.normal(0, 1, X.shape[0])
]) + np.random.normal(0, 1, X.shape[0])
TRUE_PROB = expit(LINEAR_TERM) TRUE_PROB = expit(LINEAR_TERM)
Y = np.random.binomial(1, TRUE_PROB, size=N) Y = np.random.binomial(1, TRUE_PROB, size=N)
DATA = { DATA = {
'X': X, "X": X,
'probability_labels': TRUE_PROB, "probability_labels": TRUE_PROB,
'binary_labels': Y, "binary_labels": Y,
'lgb_with_binary_labels': lgb.Dataset(X, Y), "lgb_with_binary_labels": lgb.Dataset(X, Y),
'lgb_with_probability_labels': lgb.Dataset(X, TRUE_PROB), "lgb_with_probability_labels": lgb.Dataset(X, TRUE_PROB),
} }
...@@ -72,34 +68,25 @@ def experiment(objective, label_type, data): ...@@ -72,34 +68,25 @@ def experiment(objective, label_type, data):
np.random.seed(0) np.random.seed(0)
nrounds = 5 nrounds = 5
lgb_data = data[f"lgb_with_{label_type}_labels"] lgb_data = data[f"lgb_with_{label_type}_labels"]
params = { params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1}
'objective': objective,
'feature_fraction': 1,
'bagging_fraction': 1,
'verbose': -1
}
time_zero = time.time() time_zero = time.time()
gbm = lgb.train(params, lgb_data, num_boost_round=nrounds) gbm = lgb.train(params, lgb_data, num_boost_round=nrounds)
y_fitted = gbm.predict(data['X']) y_fitted = gbm.predict(data["X"])
y_true = data[f"{label_type}_labels"] y_true = data[f"{label_type}_labels"]
duration = time.time() - time_zero duration = time.time() - time_zero
return { return {"time": duration, "correlation": np.corrcoef(y_fitted, y_true)[0, 1], "logloss": log_loss(y_fitted, y_true)}
'time': duration,
'correlation': np.corrcoef(y_fitted, y_true)[0, 1],
'logloss': log_loss(y_fitted, y_true)
}
################# #################
# Observe the behavior of `binary` and `xentropy` objectives # Observe the behavior of `binary` and `xentropy` objectives
print('Performance of `binary` objective with binary labels:') print("Performance of `binary` objective with binary labels:")
print(experiment('binary', label_type='binary', data=DATA)) print(experiment("binary", label_type="binary", data=DATA))
print('Performance of `xentropy` objective with binary labels:') print("Performance of `xentropy` objective with binary labels:")
print(experiment('xentropy', label_type='binary', data=DATA)) print(experiment("xentropy", label_type="binary", data=DATA))
print('Performance of `xentropy` objective with probability labels:') print("Performance of `xentropy` objective with probability labels:")
print(experiment('xentropy', label_type='probability', data=DATA)) print(experiment("xentropy", label_type="probability", data=DATA))
# Trying this throws an error on non-binary values of y: # Trying this throws an error on non-binary values of y:
# experiment('binary', label_type='probability', DATA) # experiment('binary', label_type='probability', DATA)
...@@ -109,9 +96,7 @@ print(experiment('xentropy', label_type='probability', data=DATA)) ...@@ -109,9 +96,7 @@ print(experiment('xentropy', label_type='probability', data=DATA))
# there are reasons to suspect that `binary` should run faster when the # there are reasons to suspect that `binary` should run faster when the
# label is an integer instead of a float # label is an integer instead of a float
K = 10 K = 10
A = [experiment('binary', label_type='binary', data=DATA)['time'] A = [experiment("binary", label_type="binary", data=DATA)["time"] for k in range(K)]
for k in range(K)] B = [experiment("xentropy", label_type="binary", data=DATA)["time"] for k in range(K)]
B = [experiment('xentropy', label_type='binary', data=DATA)['time']
for k in range(K)]
print(f"Best `binary` time: {min(A)}") print(f"Best `binary` time: {min(A)}")
print(f"Best `xentropy` time: {min(B)}") print(f"Best `xentropy` time: {min(B)}")
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -8,13 +8,13 @@ import lightgbm as lgb ...@@ -8,13 +8,13 @@ import lightgbm as lgb
if lgb.compat.MATPLOTLIB_INSTALLED: if lgb.compat.MATPLOTLIB_INSTALLED:
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
else: else:
raise ImportError('You need to install matplotlib and restart your session for plot_example.py.') raise ImportError("You need to install matplotlib and restart your session for plot_example.py.")
print('Loading data...') print("Loading data...")
# load or create your dataset # load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression' regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t') df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t') df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
y_train = df_train[0] y_train = df_train[0]
y_test = df_test[0] y_test = df_test[0]
...@@ -26,45 +26,38 @@ lgb_train = lgb.Dataset(X_train, y_train) ...@@ -26,45 +26,38 @@ lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train) lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict # specify your configurations as a dict
params = { params = {"num_leaves": 5, "metric": ("l1", "l2"), "verbose": 0}
'num_leaves': 5,
'metric': ('l1', 'l2'),
'verbose': 0
}
evals_result = {} # to record eval results for plotting evals_result = {} # to record eval results for plotting
print('Starting training...') print("Starting training...")
# train # train
gbm = lgb.train( gbm = lgb.train(
params, params,
lgb_train, lgb_train,
num_boost_round=100, num_boost_round=100,
valid_sets=[lgb_train, lgb_test], valid_sets=[lgb_train, lgb_test],
feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])], feature_name=[f"f{i + 1}" for i in range(X_train.shape[-1])],
categorical_feature=[21], categorical_feature=[21],
callbacks=[ callbacks=[lgb.log_evaluation(10), lgb.record_evaluation(evals_result)],
lgb.log_evaluation(10),
lgb.record_evaluation(evals_result)
]
) )
print('Plotting metrics recorded during training...') print("Plotting metrics recorded during training...")
ax = lgb.plot_metric(evals_result, metric='l1') ax = lgb.plot_metric(evals_result, metric="l1")
plt.show() plt.show()
print('Plotting feature importances...') print("Plotting feature importances...")
ax = lgb.plot_importance(gbm, max_num_features=10) ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show() plt.show()
print('Plotting split value histogram...') print("Plotting split value histogram...")
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto') ax = lgb.plot_split_value_histogram(gbm, feature="f26", bins="auto")
plt.show() plt.show()
print('Plotting 54th tree...') # one tree use categorical feature to split print("Plotting 54th tree...") # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain']) ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=["split_gain"])
plt.show() plt.show()
print('Plotting 54th tree with graphviz...') print("Plotting 54th tree with graphviz...")
graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54') graph = lgb.create_tree_digraph(gbm, tree_index=53, name="Tree54")
graph.render(view=True) graph.render(view=True)
...@@ -6,11 +6,11 @@ from sklearn.metrics import mean_squared_error ...@@ -6,11 +6,11 @@ from sklearn.metrics import mean_squared_error
import lightgbm as lgb import lightgbm as lgb
print('Loading data...') print("Loading data...")
# load or create your dataset # load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression' regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t') df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t') df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
y_train = df_train[0] y_train = df_train[0]
y_test = df_test[0] y_test = df_test[0]
...@@ -23,32 +23,30 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) ...@@ -23,32 +23,30 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict # specify your configurations as a dict
params = { params = {
'boosting_type': 'gbdt', "boosting_type": "gbdt",
'objective': 'regression', "objective": "regression",
'metric': {'l2', 'l1'}, "metric": {"l2", "l1"},
'num_leaves': 31, "num_leaves": 31,
'learning_rate': 0.05, "learning_rate": 0.05,
'feature_fraction': 0.9, "feature_fraction": 0.9,
'bagging_fraction': 0.8, "bagging_fraction": 0.8,
'bagging_freq': 5, "bagging_freq": 5,
'verbose': 0 "verbose": 0,
} }
print('Starting training...') print("Starting training...")
# train # train
gbm = lgb.train(params, gbm = lgb.train(
lgb_train, params, lgb_train, num_boost_round=20, valid_sets=lgb_eval, callbacks=[lgb.early_stopping(stopping_rounds=5)]
num_boost_round=20, )
valid_sets=lgb_eval,
callbacks=[lgb.early_stopping(stopping_rounds=5)])
print('Saving model...') print("Saving model...")
# save model to file # save model to file
gbm.save_model('model.txt') gbm.save_model("model.txt")
print('Starting predicting...') print("Starting predicting...")
# predict # predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval # eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5 rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}') print(f"The RMSE of prediction is: {rmse_test}")
...@@ -8,85 +8,71 @@ from sklearn.model_selection import GridSearchCV ...@@ -8,85 +8,71 @@ from sklearn.model_selection import GridSearchCV
import lightgbm as lgb import lightgbm as lgb
print('Loading data...') print("Loading data...")
# load or create your dataset # load or create your dataset
regression_example_dir = Path(__file__).absolute().parents[1] / 'regression' regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t') df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t') df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
y_train = df_train[0] y_train = df_train[0]
y_test = df_test[0] y_test = df_test[0]
X_train = df_train.drop(0, axis=1) X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1) X_test = df_test.drop(0, axis=1)
print('Starting training...') print("Starting training...")
# train # train
gbm = lgb.LGBMRegressor(num_leaves=31, gbm = lgb.LGBMRegressor(num_leaves=31, learning_rate=0.05, n_estimators=20)
learning_rate=0.05, gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="l1", callbacks=[lgb.early_stopping(5)])
n_estimators=20)
gbm.fit(X_train, y_train, print("Starting predicting...")
eval_set=[(X_test, y_test)],
eval_metric='l1',
callbacks=[lgb.early_stopping(5)])
print('Starting predicting...')
# predict # predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval # eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5 rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}') print(f"The RMSE of prediction is: {rmse_test}")
# feature importances # feature importances
print(f'Feature importances: {list(gbm.feature_importances_)}') print(f"Feature importances: {list(gbm.feature_importances_)}")
# self-defined eval metric # self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool # f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE) # Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred): def rmsle(y_true, y_pred):
return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False return "RMSLE", np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
print('Starting training with custom eval function...') print("Starting training with custom eval function...")
# train # train
gbm.fit(X_train, y_train, gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=rmsle, callbacks=[lgb.early_stopping(5)])
eval_set=[(X_test, y_test)],
eval_metric=rmsle,
callbacks=[lgb.early_stopping(5)])
# another self-defined eval metric # another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool # f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE) # Relative Absolute Error (RAE)
def rae(y_true, y_pred): def rae(y_true, y_pred):
return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False return "RAE", np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False
print('Starting training with multiple custom eval functions...') print("Starting training with multiple custom eval functions...")
# train # train
gbm.fit(X_train, y_train, gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=[rmsle, rae], callbacks=[lgb.early_stopping(5)])
eval_set=[(X_test, y_test)],
eval_metric=[rmsle, rae],
callbacks=[lgb.early_stopping(5)])
print('Starting predicting...') print("Starting predicting...")
# predict # predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval # eval
rmsle_test = rmsle(y_test, y_pred)[1] rmsle_test = rmsle(y_test, y_pred)[1]
rae_test = rae(y_test, y_pred)[1] rae_test = rae(y_test, y_pred)[1]
print(f'The RMSLE of prediction is: {rmsle_test}') print(f"The RMSLE of prediction is: {rmsle_test}")
print(f'The RAE of prediction is: {rae_test}') print(f"The RAE of prediction is: {rae_test}")
# other scikit-learn modules # other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31) estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = { param_grid = {"learning_rate": [0.01, 0.1, 1], "n_estimators": [20, 40]}
'learning_rate': [0.01, 0.1, 1],
'n_estimators': [20, 40]
}
gbm = GridSearchCV(estimator, param_grid, cv=3) gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train) gbm.fit(X_train, y_train)
print(f'Best parameters found by grid search are: {gbm.best_params_}') print(f"Best parameters found by grid search are: {gbm.best_params_}")
...@@ -18,9 +18,23 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, ...@@ -18,9 +18,23 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional,
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat, from .compat import (
dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table, PANDAS_INSTALLED,
pd_CategoricalDtype, pd_DataFrame, pd_Series) PYARROW_INSTALLED,
arrow_cffi,
arrow_is_floating,
arrow_is_integer,
concat,
dt_DataTable,
pa_Array,
pa_chunked_array,
pa_ChunkedArray,
pa_compute,
pa_Table,
pd_CategoricalDtype,
pd_DataFrame,
pd_Series,
)
from .libpath import find_lib_path from .libpath import find_lib_path
if TYPE_CHECKING: if TYPE_CHECKING:
......
...@@ -5,8 +5,14 @@ from dataclasses import dataclass ...@@ -5,8 +5,14 @@ from dataclasses import dataclass
from functools import partial from functools import partial
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType, from .basic import (
_LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning) Booster,
_ConfigAliases,
_LGBM_BoosterEvalMethodResultType,
_LGBM_BoosterEvalMethodResultWithStandardDeviationType,
_log_info,
_log_warning,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from .engine import CVBooster from .engine import CVBooster
......
...@@ -19,12 +19,36 @@ import numpy as np ...@@ -19,12 +19,36 @@ import numpy as np
import scipy.sparse as ss import scipy.sparse as ss
from .basic import LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning from .basic import LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning
from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, Future, LGBMNotFittedError, concat, from .compat import (
dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series, DASK_INSTALLED,
default_client, delayed, pd_DataFrame, pd_Series, wait) PANDAS_INSTALLED,
from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomObjectiveFunction, SKLEARN_INSTALLED,
_LGBM_ScikitEvalMetricType, _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit, Client,
_lgbmmodel_doc_predict) Future,
LGBMNotFittedError,
concat,
dask_Array,
dask_array_from_delayed,
dask_bag_from_delayed,
dask_DataFrame,
dask_Series,
default_client,
delayed,
pd_DataFrame,
pd_Series,
wait,
)
from .sklearn import (
LGBMClassifier,
LGBMModel,
LGBMRanker,
LGBMRegressor,
_LGBM_ScikitCustomObjectiveFunction,
_LGBM_ScikitEvalMetricType,
_lgbmmodel_doc_custom_eval_note,
_lgbmmodel_doc_fit,
_lgbmmodel_doc_predict,
)
__all__ = [ __all__ = [
'DaskLGBMClassifier', 'DaskLGBMClassifier',
......
...@@ -10,10 +10,21 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union ...@@ -10,10 +10,21 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np import numpy as np
from . import callback from . import callback
from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, from .basic import (
_LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType, Booster,
_LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, Dataset,
_LGBM_FeatureNameConfiguration, _log_warning) LightGBMError,
_choose_param_value,
_ConfigAliases,
_InnerPredictor,
_LGBM_BoosterEvalMethodResultType,
_LGBM_BoosterEvalMethodResultWithStandardDeviationType,
_LGBM_CategoricalFeatureConfiguration,
_LGBM_CustomObjectiveFunction,
_LGBM_EvalFunctionResultType,
_LGBM_FeatureNameConfiguration,
_log_warning,
)
from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold
__all__ = [ __all__ = [
......
...@@ -8,14 +8,41 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union ...@@ -8,14 +8,41 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType, from .basic import (
_LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration, Booster,
_LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning) Dataset,
LightGBMError,
_choose_param_value,
_ConfigAliases,
_LGBM_BoosterBestScoreType,
_LGBM_CategoricalFeatureConfiguration,
_LGBM_EvalFunctionResultType,
_LGBM_FeatureNameConfiguration,
_LGBM_GroupType,
_LGBM_InitScoreType,
_LGBM_LabelType,
_LGBM_WeightType,
_log_warning,
)
from .callback import _EvalResultDict, record_evaluation from .callback import _EvalResultDict, record_evaluation
from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray, from .compat import (
_LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase, SKLEARN_INSTALLED,
_LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase, LGBMNotFittedError,
dt_DataTable, np_random_Generator, pd_DataFrame) _LGBMAssertAllFinite,
_LGBMCheckArray,
_LGBMCheckClassificationTargets,
_LGBMCheckSampleWeight,
_LGBMCheckXY,
_LGBMClassifierBase,
_LGBMComputeSampleWeight,
_LGBMCpuCount,
_LGBMLabelEncoder,
_LGBMModelBase,
_LGBMRegressorBase,
dt_DataTable,
np_random_Generator,
pd_DataFrame,
)
from .engine import train from .engine import train
__all__ = [ __all__ = [
......
...@@ -81,10 +81,14 @@ minimum-version = "0.4.4" ...@@ -81,10 +81,14 @@ minimum-version = "0.4.4"
# end:build-system # end:build-system
[tool.isort] [tool.isort]
include_trailing_comma = true
line_length = 120 line_length = 120
# "vertical hanging indent", to match what ruff-format does
# ref: https://pycqa.github.io/isort/docs/configuration/multi_line_output_modes.html#3-vertical-hanging-indent
multi_line_output = 3
skip_glob = [ skip_glob = [
"*/external_libs/*", "*/external_libs/*",
"*/lightgbm-python/*" "*/lightgbm-python/*",
] ]
[tool.mypy] [tool.mypy]
...@@ -108,14 +112,13 @@ docstring-code-format = false ...@@ -108,14 +112,13 @@ docstring-code-format = false
exclude = [ exclude = [
"build/*.py", "build/*.py",
"compile/*.py", "compile/*.py",
"examples/*.py",
"external_libs/*.py", "external_libs/*.py",
"lightgbm-python/*.py", "lightgbm-python/*.py",
"python-package/*.py", "python-package/*.py",
"tests/*.py"
] ]
indent-style = "space" indent-style = "space"
quote-style = "double" quote-style = "double"
skip-magic-trailing-comma = false
[tool.ruff.lint] [tool.ruff.lint]
ignore = [ ignore = [
......
...@@ -10,7 +10,7 @@ try: ...@@ -10,7 +10,7 @@ try:
from lightgbm.basic import _LIB as LIB from lightgbm.basic import _LIB as LIB
except ModuleNotFoundError: except ModuleNotFoundError:
print("Could not import lightgbm Python package, looking for lib_lightgbm at the repo root") print("Could not import lightgbm Python package, looking for lib_lightgbm at the repo root")
if system() in ('Windows', 'Microsoft'): if system() in ("Windows", "Microsoft"):
lib_file = Path(__file__).absolute().parents[2] / "Release" / "lib_lightgbm.dll" lib_file = Path(__file__).absolute().parents[2] / "Release" / "lib_lightgbm.dll"
else: else:
lib_file = Path(__file__).absolute().parents[2] / "lib_lightgbm.so" lib_file = Path(__file__).absolute().parents[2] / "lib_lightgbm.so"
...@@ -25,7 +25,7 @@ dtype_int64 = 3 ...@@ -25,7 +25,7 @@ dtype_int64 = 3
def c_str(string): def c_str(string):
return ctypes.c_char_p(string.encode('utf-8')) return ctypes.c_char_p(string.encode("utf-8"))
def load_from_file(filename, reference): def load_from_file(filename, reference):
...@@ -33,17 +33,13 @@ def load_from_file(filename, reference): ...@@ -33,17 +33,13 @@ def load_from_file(filename, reference):
if reference is not None: if reference is not None:
ref = reference ref = reference
handle = ctypes.c_void_p() handle = ctypes.c_void_p()
LIB.LGBM_DatasetCreateFromFile( LIB.LGBM_DatasetCreateFromFile(c_str(str(filename)), c_str("max_bin=15"), ref, ctypes.byref(handle))
c_str(str(filename)),
c_str('max_bin=15'),
ref,
ctypes.byref(handle))
print(LIB.LGBM_GetLastError()) print(LIB.LGBM_GetLastError())
num_data = ctypes.c_int(0) num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data)) LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0) num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature)) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
print(f'#data: {num_data.value} #feature: {num_feature.value}') print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle return handle
...@@ -69,20 +65,22 @@ def load_from_csr(filename, reference): ...@@ -69,20 +65,22 @@ def load_from_csr(filename, reference):
ctypes.c_int64(len(csr.indptr)), ctypes.c_int64(len(csr.indptr)),
ctypes.c_int64(len(csr.data)), ctypes.c_int64(len(csr.data)),
ctypes.c_int64(csr.shape[1]), ctypes.c_int64(csr.shape[1]),
c_str('max_bin=15'), c_str("max_bin=15"),
ref, ref,
ctypes.byref(handle)) ctypes.byref(handle),
)
num_data = ctypes.c_int(0) num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data)) LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0) num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature)) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField( LIB.LGBM_DatasetSetField(
handle, handle,
c_str('label'), c_str("label"),
label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.c_int(len(label)), ctypes.c_int(len(label)),
ctypes.c_int(dtype_float32)) ctypes.c_int(dtype_float32),
print(f'#data: {num_data.value} #feature: {num_feature.value}') )
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle return handle
...@@ -104,20 +102,22 @@ def load_from_csc(filename, reference): ...@@ -104,20 +102,22 @@ def load_from_csc(filename, reference):
ctypes.c_int64(len(csc.indptr)), ctypes.c_int64(len(csc.indptr)),
ctypes.c_int64(len(csc.data)), ctypes.c_int64(len(csc.data)),
ctypes.c_int64(csc.shape[0]), ctypes.c_int64(csc.shape[0]),
c_str('max_bin=15'), c_str("max_bin=15"),
ref, ref,
ctypes.byref(handle)) ctypes.byref(handle),
)
num_data = ctypes.c_int(0) num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data)) LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0) num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature)) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField( LIB.LGBM_DatasetSetField(
handle, handle,
c_str('label'), c_str("label"),
label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.c_int(len(label)), ctypes.c_int(len(label)),
ctypes.c_int(dtype_float32)) ctypes.c_int(dtype_float32),
print(f'#data: {num_data.value} #feature: {num_feature.value}') )
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle return handle
...@@ -137,20 +137,22 @@ def load_from_mat(filename, reference): ...@@ -137,20 +137,22 @@ def load_from_mat(filename, reference):
ctypes.c_int32(mat.shape[0]), ctypes.c_int32(mat.shape[0]),
ctypes.c_int32(mat.shape[1]), ctypes.c_int32(mat.shape[1]),
ctypes.c_int(1), ctypes.c_int(1),
c_str('max_bin=15'), c_str("max_bin=15"),
ref, ref,
ctypes.byref(handle)) ctypes.byref(handle),
)
num_data = ctypes.c_int(0) num_data = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data)) LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_int(0) num_feature = ctypes.c_int(0)
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature)) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField( LIB.LGBM_DatasetSetField(
handle, handle,
c_str('label'), c_str("label"),
label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.c_int(len(label)), ctypes.c_int(len(label)),
ctypes.c_int(dtype_float32)) ctypes.c_int(dtype_float32),
print(f'#data: {num_data.value} #feature: {num_feature.value}') )
print(f"#data: {num_data.value} #feature: {num_feature.value}")
return handle return handle
...@@ -159,29 +161,26 @@ def free_dataset(handle): ...@@ -159,29 +161,26 @@ def free_dataset(handle):
def test_dataset(): def test_dataset():
binary_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification' binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
train = load_from_file(binary_example_dir / 'binary.train', None) train = load_from_file(binary_example_dir / "binary.train", None)
test = load_from_mat(binary_example_dir / 'binary.test', train) test = load_from_mat(binary_example_dir / "binary.test", train)
free_dataset(test) free_dataset(test)
test = load_from_csr(binary_example_dir / 'binary.test', train) test = load_from_csr(binary_example_dir / "binary.test", train)
free_dataset(test) free_dataset(test)
test = load_from_csc(binary_example_dir / 'binary.test', train) test = load_from_csc(binary_example_dir / "binary.test", train)
free_dataset(test) free_dataset(test)
save_to_binary(train, 'train.binary.bin') save_to_binary(train, "train.binary.bin")
free_dataset(train) free_dataset(train)
train = load_from_file('train.binary.bin', None) train = load_from_file("train.binary.bin", None)
free_dataset(train) free_dataset(train)
def test_booster(): def test_booster():
binary_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification' binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
train = load_from_mat(binary_example_dir / 'binary.train', None) train = load_from_mat(binary_example_dir / "binary.train", None)
test = load_from_mat(binary_example_dir / 'binary.test', train) test = load_from_mat(binary_example_dir / "binary.test", train)
booster = ctypes.c_void_p() booster = ctypes.c_void_p()
LIB.LGBM_BoosterCreate( LIB.LGBM_BoosterCreate(train, c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
train,
c_str("app=binary metric=auc num_leaves=31 verbose=0"),
ctypes.byref(booster))
LIB.LGBM_BoosterAddValidData(booster, test) LIB.LGBM_BoosterAddValidData(booster, test)
is_finished = ctypes.c_int(0) is_finished = ctypes.c_int(0)
for i in range(1, 51): for i in range(1, 51):
...@@ -189,28 +188,18 @@ def test_booster(): ...@@ -189,28 +188,18 @@ def test_booster():
result = np.array([0.0], dtype=np.float64) result = np.array([0.0], dtype=np.float64)
out_len = ctypes.c_int(0) out_len = ctypes.c_int(0)
LIB.LGBM_BoosterGetEval( LIB.LGBM_BoosterGetEval(
booster, booster, ctypes.c_int(0), ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
ctypes.c_int(0), )
ctypes.byref(out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
if i % 10 == 0: if i % 10 == 0:
print(f'{i} iteration test AUC {result[0]:.6f}') print(f"{i} iteration test AUC {result[0]:.6f}")
LIB.LGBM_BoosterSaveModel( LIB.LGBM_BoosterSaveModel(booster, ctypes.c_int(0), ctypes.c_int(-1), ctypes.c_int(0), c_str("model.txt"))
booster,
ctypes.c_int(0),
ctypes.c_int(-1),
ctypes.c_int(0),
c_str('model.txt'))
LIB.LGBM_BoosterFree(booster) LIB.LGBM_BoosterFree(booster)
free_dataset(train) free_dataset(train)
free_dataset(test) free_dataset(test)
booster2 = ctypes.c_void_p() booster2 = ctypes.c_void_p()
num_total_model = ctypes.c_int(0) num_total_model = ctypes.c_int(0)
LIB.LGBM_BoosterCreateFromModelfile( LIB.LGBM_BoosterCreateFromModelfile(c_str("model.txt"), ctypes.byref(num_total_model), ctypes.byref(booster2))
c_str('model.txt'), data = np.loadtxt(str(binary_example_dir / "binary.test"), dtype=np.float64)
ctypes.byref(num_total_model),
ctypes.byref(booster2))
data = np.loadtxt(str(binary_example_dir / 'binary.test'), dtype=np.float64)
mat = data[:, 1:] mat = data[:, 1:]
preb = np.empty(mat.shape[0], dtype=np.float64) preb = np.empty(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_int64(0) num_preb = ctypes.c_int64(0)
...@@ -225,58 +214,51 @@ def test_booster(): ...@@ -225,58 +214,51 @@ def test_booster():
ctypes.c_int(1), ctypes.c_int(1),
ctypes.c_int(0), ctypes.c_int(0),
ctypes.c_int(25), ctypes.c_int(25),
c_str(''), c_str(""),
ctypes.byref(num_preb), ctypes.byref(num_preb),
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double))) preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
)
LIB.LGBM_BoosterPredictForFile( LIB.LGBM_BoosterPredictForFile(
booster2, booster2,
c_str(str(binary_example_dir / 'binary.test')), c_str(str(binary_example_dir / "binary.test")),
ctypes.c_int(0), ctypes.c_int(0),
ctypes.c_int(0), ctypes.c_int(0),
ctypes.c_int(0), ctypes.c_int(0),
ctypes.c_int(25), ctypes.c_int(25),
c_str(''), c_str(""),
c_str('preb.txt')) c_str("preb.txt"),
)
LIB.LGBM_BoosterPredictForFile( LIB.LGBM_BoosterPredictForFile(
booster2, booster2,
c_str(str(binary_example_dir / 'binary.test')), c_str(str(binary_example_dir / "binary.test")),
ctypes.c_int(0), ctypes.c_int(0),
ctypes.c_int(0), ctypes.c_int(0),
ctypes.c_int(10), ctypes.c_int(10),
ctypes.c_int(25), ctypes.c_int(25),
c_str(''), c_str(""),
c_str('preb.txt')) c_str("preb.txt"),
)
LIB.LGBM_BoosterFree(booster2) LIB.LGBM_BoosterFree(booster2)
def test_max_thread_control(): def test_max_thread_control():
# at initialization, should be -1 # at initialization, should be -1
num_threads = ctypes.c_int(0) num_threads = ctypes.c_int(0)
ret = LIB.LGBM_GetMaxThreads( ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
ctypes.byref(num_threads)
)
assert ret == 0 assert ret == 0
assert num_threads.value == -1 assert num_threads.value == -1
# updating that value through the C API should work # updating that value through the C API should work
ret = LIB.LGBM_SetMaxThreads( ret = LIB.LGBM_SetMaxThreads(ctypes.c_int(6))
ctypes.c_int(6)
)
assert ret == 0 assert ret == 0
ret = LIB.LGBM_GetMaxThreads( ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
ctypes.byref(num_threads)
)
assert ret == 0 assert ret == 0
assert num_threads.value == 6 assert num_threads.value == 6
# resetting to any negative number should set it to -1 # resetting to any negative number should set it to -1
ret = LIB.LGBM_SetMaxThreads( ret = LIB.LGBM_SetMaxThreads(ctypes.c_int(-123))
ctypes.c_int(-123)
)
assert ret == 0 assert ret == 0
ret = LIB.LGBM_GetMaxThreads( ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
ctypes.byref(num_threads)
)
assert ret == 0 assert ret == 0
assert num_threads.value == -1 assert num_threads.value == -1
...@@ -3,5 +3,5 @@ from pathlib import Path ...@@ -3,5 +3,5 @@ from pathlib import Path
import numpy as np import numpy as np
preds = [np.loadtxt(str(name)) for name in Path(__file__).absolute().parent.glob('*.pred')] preds = [np.loadtxt(str(name)) for name in Path(__file__).absolute().parent.glob("*.pred")]
np.testing.assert_allclose(preds[0], preds[1]) np.testing.assert_allclose(preds[0], preds[1])
...@@ -14,16 +14,16 @@ from sklearn.metrics import accuracy_score ...@@ -14,16 +14,16 @@ from sklearn.metrics import accuracy_score
TESTS_DIR = Path(__file__).absolute().parent TESTS_DIR = Path(__file__).absolute().parent
@pytest.fixture(scope='module') @pytest.fixture(scope="module")
def executable(pytestconfig) -> str: def executable(pytestconfig) -> str:
"""Returns the path to the lightgbm executable.""" """Returns the path to the lightgbm executable."""
return pytestconfig.getoption('execfile') return pytestconfig.getoption("execfile")
def _find_random_open_port() -> int: def _find_random_open_port() -> int:
"""Find a random open port on localhost.""" """Find a random open port on localhost."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0)) s.bind(("", 0))
port = s.getsockname()[1] port = s.getsockname()[1]
return port # noqa: RET504 return port # noqa: RET504
...@@ -34,7 +34,7 @@ def _generate_n_ports(n: int) -> Generator[int, None, None]: ...@@ -34,7 +34,7 @@ def _generate_n_ports(n: int) -> Generator[int, None, None]:
def _write_dict(d: Dict, file: io.TextIOWrapper) -> None: def _write_dict(d: Dict, file: io.TextIOWrapper) -> None:
for k, v in d.items(): for k, v in d.items():
file.write(f'{k} = {v}\n') file.write(f"{k} = {v}\n")
def create_data(task: str, n_samples: int = 1_000) -> np.ndarray: def create_data(task: str, n_samples: int = 1_000) -> np.ndarray:
...@@ -42,10 +42,10 @@ def create_data(task: str, n_samples: int = 1_000) -> np.ndarray: ...@@ -42,10 +42,10 @@ def create_data(task: str, n_samples: int = 1_000) -> np.ndarray:
The data is returned as a numpy array with the label as the first column. The data is returned as a numpy array with the label as the first column.
""" """
if task == 'binary-classification': if task == "binary-classification":
centers = [[-4, -4], [4, 4]] centers = [[-4, -4], [4, 4]]
X, y = make_blobs(n_samples, centers=centers, random_state=42) X, y = make_blobs(n_samples, centers=centers, random_state=42)
elif task == 'regression': elif task == "regression":
X, y = make_regression(n_samples, n_features=4, n_informative=2, random_state=42) X, y = make_regression(n_samples, n_features=4, n_informative=2, random_state=42)
return np.hstack([y.reshape(-1, 1), X]) return np.hstack([y.reshape(-1, 1), X])
...@@ -54,22 +54,22 @@ class DistributedMockup: ...@@ -54,22 +54,22 @@ class DistributedMockup:
"""Simulate distributed training.""" """Simulate distributed training."""
default_train_config = { default_train_config = {
'task': 'train', "task": "train",
'pre_partition': True, "pre_partition": True,
'machine_list_file': TESTS_DIR / 'mlist.txt', "machine_list_file": TESTS_DIR / "mlist.txt",
'tree_learner': 'data', "tree_learner": "data",
'force_row_wise': True, "force_row_wise": True,
'verbose': 0, "verbose": 0,
'num_boost_round': 20, "num_boost_round": 20,
'num_leaves': 15, "num_leaves": 15,
'num_threads': 2, "num_threads": 2,
} }
default_predict_config = { default_predict_config = {
'task': 'predict', "task": "predict",
'data': TESTS_DIR / 'train.txt', "data": TESTS_DIR / "train.txt",
'input_model': TESTS_DIR / 'model0.txt', "input_model": TESTS_DIR / "model0.txt",
'output_result': TESTS_DIR / 'predictions.txt', "output_result": TESTS_DIR / "predictions.txt",
} }
def __init__(self, executable: str): def __init__(self, executable: str):
...@@ -77,8 +77,8 @@ class DistributedMockup: ...@@ -77,8 +77,8 @@ class DistributedMockup:
def worker_train(self, i: int) -> subprocess.CompletedProcess: def worker_train(self, i: int) -> subprocess.CompletedProcess:
"""Start the training process on the `i`-th worker.""" """Start the training process on the `i`-th worker."""
config_path = TESTS_DIR / f'train{i}.conf' config_path = TESTS_DIR / f"train{i}.conf"
cmd = [self.executable, f'config={config_path}'] cmd = [self.executable, f"config={config_path}"]
return subprocess.run(cmd) return subprocess.run(cmd)
def _set_ports(self) -> None: def _set_ports(self) -> None:
...@@ -92,18 +92,18 @@ class DistributedMockup: ...@@ -92,18 +92,18 @@ class DistributedMockup:
ports.update(candidates) ports.update(candidates)
i += 1 i += 1
if i == max_tries: if i == max_tries:
raise RuntimeError('Unable to find non-colliding ports.') raise RuntimeError("Unable to find non-colliding ports.")
self.listen_ports = list(ports) self.listen_ports = list(ports)
with open(TESTS_DIR / 'mlist.txt', 'wt') as f: with open(TESTS_DIR / "mlist.txt", "wt") as f:
for port in self.listen_ports: for port in self.listen_ports:
f.write(f'127.0.0.1 {port}\n') f.write(f"127.0.0.1 {port}\n")
def _write_data(self, partitions: List[np.ndarray]) -> None: def _write_data(self, partitions: List[np.ndarray]) -> None:
"""Write all training data as train.txt and each training partition as train{i}.txt.""" """Write all training data as train.txt and each training partition as train{i}.txt."""
all_data = np.vstack(partitions) all_data = np.vstack(partitions)
np.savetxt(str(TESTS_DIR / 'train.txt'), all_data, delimiter=',') np.savetxt(str(TESTS_DIR / "train.txt"), all_data, delimiter=",")
for i, partition in enumerate(partitions): for i, partition in enumerate(partitions):
np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',') np.savetxt(str(TESTS_DIR / f"train{i}.txt"), partition, delimiter=",")
def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None: def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None:
"""Run the distributed training process on a single machine. """Run the distributed training process on a single machine.
...@@ -118,7 +118,7 @@ class DistributedMockup: ...@@ -118,7 +118,7 @@ class DistributedMockup:
""" """
self.train_config = copy.deepcopy(self.default_train_config) self.train_config = copy.deepcopy(self.default_train_config)
self.train_config.update(train_config) self.train_config.update(train_config)
self.n_workers = self.train_config['num_machines'] self.n_workers = self.train_config["num_machines"]
self._set_ports() self._set_ports()
self._write_data(partitions) self._write_data(partitions)
self.label_ = np.hstack([partition[:, 0] for partition in partitions]) self.label_ = np.hstack([partition[:, 0] for partition in partitions])
...@@ -131,7 +131,7 @@ class DistributedMockup: ...@@ -131,7 +131,7 @@ class DistributedMockup:
results = [f.result() for f in futures] results = [f.result() for f in futures]
for result in results: for result in results:
if result.returncode != 0: if result.returncode != 0:
raise RuntimeError('Error in training') raise RuntimeError("Error in training")
def predict(self, predict_config: Dict[str, Any]) -> np.ndarray: def predict(self, predict_config: Dict[str, Any]) -> np.ndarray:
"""Compute the predictions using the model created in the fit step. """Compute the predictions using the model created in the fit step.
...@@ -141,14 +141,14 @@ class DistributedMockup: ...@@ -141,14 +141,14 @@ class DistributedMockup:
""" """
self.predict_config = copy.deepcopy(self.default_predict_config) self.predict_config = copy.deepcopy(self.default_predict_config)
self.predict_config.update(predict_config) self.predict_config.update(predict_config)
config_path = TESTS_DIR / 'predict.conf' config_path = TESTS_DIR / "predict.conf"
with open(config_path, 'wt') as file: with open(config_path, "wt") as file:
_write_dict(self.predict_config, file) _write_dict(self.predict_config, file)
cmd = [self.executable, f'config={config_path}'] cmd = [self.executable, f"config={config_path}"]
result = subprocess.run(cmd) result = subprocess.run(cmd)
if result.returncode != 0: if result.returncode != 0:
raise RuntimeError('Error in prediction') raise RuntimeError("Error in prediction")
return np.loadtxt(str(TESTS_DIR / 'predictions.txt')) return np.loadtxt(str(TESTS_DIR / "predictions.txt"))
def write_train_config(self, i: int) -> None: def write_train_config(self, i: int) -> None:
"""Create a file train{i}.conf with the required configuration to train. """Create a file train{i}.conf with the required configuration to train.
...@@ -156,41 +156,41 @@ class DistributedMockup: ...@@ -156,41 +156,41 @@ class DistributedMockup:
Each worker gets a different port and piece of the data, the rest are the Each worker gets a different port and piece of the data, the rest are the
model parameters contained in `self.config`. model parameters contained in `self.config`.
""" """
with open(TESTS_DIR / f'train{i}.conf', 'wt') as file: with open(TESTS_DIR / f"train{i}.conf", "wt") as file:
output_model = TESTS_DIR / f'model{i}.txt' output_model = TESTS_DIR / f"model{i}.txt"
data = TESTS_DIR / f'train{i}.txt' data = TESTS_DIR / f"train{i}.txt"
file.write(f'output_model = {output_model}\n') file.write(f"output_model = {output_model}\n")
file.write(f'local_listen_port = {self.listen_ports[i]}\n') file.write(f"local_listen_port = {self.listen_ports[i]}\n")
file.write(f'data = {data}\n') file.write(f"data = {data}\n")
_write_dict(self.train_config, file) _write_dict(self.train_config, file)
def test_classifier(executable): def test_classifier(executable):
"""Test the classification task.""" """Test the classification task."""
num_machines = 2 num_machines = 2
data = create_data(task='binary-classification') data = create_data(task="binary-classification")
partitions = np.array_split(data, num_machines) partitions = np.array_split(data, num_machines)
train_params = { train_params = {
'objective': 'binary', "objective": "binary",
'num_machines': num_machines, "num_machines": num_machines,
} }
clf = DistributedMockup(executable) clf = DistributedMockup(executable)
clf.fit(partitions, train_params) clf.fit(partitions, train_params)
y_probas = clf.predict(predict_config={}) y_probas = clf.predict(predict_config={})
y_pred = y_probas > 0.5 y_pred = y_probas > 0.5
assert accuracy_score(clf.label_, y_pred) == 1. assert accuracy_score(clf.label_, y_pred) == 1.0
def test_regressor(executable):
    """Test the regression task."""
    n_workers = 2
    dataset = create_data(task="regression")
    shards = np.array_split(dataset, n_workers)
    params = {"objective": "regression", "num_machines": n_workers}
    reg = DistributedMockup(executable)
    reg.fit(shards, params)
    predictions = reg.predict(predict_config={})
    # Regression is noisier than classification, so allow generous tolerances.
    np.testing.assert_allclose(predictions, reg.label_, rtol=0.2, atol=50.0)
from pathlib import Path

# Default location of the compiled LightGBM binary: two directories above
# this conftest file.
default_exec_file = Path(__file__).absolute().parents[2] / "lightgbm"


def pytest_addoption(parser):
    """Let tests override the LightGBM executable path via ``--execfile``."""
    binary = str(default_exec_file)
    parser.addoption("--execfile", action="store", default=binary)
...@@ -71,9 +71,7 @@ def generate_random_arrow_table( ...@@ -71,9 +71,7 @@ def generate_random_arrow_table(
values: Optional[np.ndarray] = None, values: Optional[np.ndarray] = None,
) -> pa.Table: ) -> pa.Table:
columns = [ columns = [
generate_random_arrow_array( generate_random_arrow_array(num_datapoints, seed + i, generate_nulls=generate_nulls, values=values)
num_datapoints, seed + i, generate_nulls=generate_nulls, values=values
)
for i in range(num_columns) for i in range(num_columns)
] ]
names = [f"col_{i}" for i in range(num_columns)] names = [f"col_{i}" for i in range(num_columns)]
...@@ -156,9 +154,7 @@ def test_dataset_construct_fields_fuzzy(): ...@@ -156,9 +154,7 @@ def test_dataset_construct_fields_fuzzy():
arrow_weights = generate_random_arrow_array(1000, 42, generate_nulls=False) arrow_weights = generate_random_arrow_array(1000, 42, generate_nulls=False)
arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32()) arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32())
arrow_dataset = lgb.Dataset( arrow_dataset = lgb.Dataset(arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups)
arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups
)
arrow_dataset.construct() arrow_dataset.construct()
pandas_dataset = lgb.Dataset( pandas_dataset = lgb.Dataset(
...@@ -171,9 +167,7 @@ def test_dataset_construct_fields_fuzzy(): ...@@ -171,9 +167,7 @@ def test_dataset_construct_fields_fuzzy():
# Check for equality # Check for equality
for field in ("label", "weight", "group"): for field in ("label", "weight", "group"):
np_assert_array_equal( np_assert_array_equal(arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True)
arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True
)
np_assert_array_equal(arrow_dataset.get_label(), pandas_dataset.get_label(), strict=True) np_assert_array_equal(arrow_dataset.get_label(), pandas_dataset.get_label(), strict=True)
np_assert_array_equal(arrow_dataset.get_weight(), pandas_dataset.get_weight(), strict=True) np_assert_array_equal(arrow_dataset.get_weight(), pandas_dataset.get_weight(), strict=True)
...@@ -269,9 +263,7 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type): ...@@ -269,9 +263,7 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type):
], ],
) )
@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) @pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES)
def test_dataset_construct_init_scores_array( def test_dataset_construct_init_scores_array(array_type: Any, init_score_data: Any, arrow_type: Any):
array_type: Any, init_score_data: Any, arrow_type: Any
):
data = generate_dummy_arrow_table() data = generate_dummy_arrow_table()
init_scores = array_type(init_score_data, type=arrow_type) init_scores = array_type(init_score_data, type=arrow_type)
dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params())
...@@ -320,9 +312,7 @@ def assert_equal_predict_arrow_pandas(booster: lgb.Booster, data: pa.Table): ...@@ -320,9 +312,7 @@ def assert_equal_predict_arrow_pandas(booster: lgb.Booster, data: pa.Table):
np_assert_array_equal(p_pred_contrib_arrow, p_pred_contrib_pandas, strict=True) np_assert_array_equal(p_pred_contrib_arrow, p_pred_contrib_pandas, strict=True)
p_first_iter_arrow = booster.predict(data, start_iteration=0, num_iteration=1, raw_score=True) p_first_iter_arrow = booster.predict(data, start_iteration=0, num_iteration=1, raw_score=True)
p_first_iter_pandas = booster.predict( p_first_iter_pandas = booster.predict(data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True)
data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True
)
np_assert_array_equal(p_first_iter_arrow, p_first_iter_pandas, strict=True) np_assert_array_equal(p_first_iter_arrow, p_first_iter_pandas, strict=True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment