Commit 1b792e71 (unverified) in tianlh/LightGBM-DCU
Authored Feb 21, 2024 by James Lamb; committed by GitHub on Feb 21, 2024
Parent: b60068c8

[ci] [python-package] enable ruff-format on tests and examples (#6317)

Showing 20 changed files with 484 additions and 477 deletions (+484, -477)
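The hunks below are almost entirely mechanical: this commit lets ruff-format rewrite the example scripts and tests, and aligns the isort configuration with it, without changing any behavior. As a minimal standalone sketch of the three rewrites that recur throughout the diff (illustration only, not code from this commit):

    rate = 0.5

    # Before ruff-format: single-quoted strings, bare float literals, no trailing comma.
    params = {
        'objective': 'binary',
        'verbose': 0
    }
    score = 1. / (1. + rate)

    # After ruff-format: double quotes, explicit ".0" float literals, and a
    # trailing comma after the last entry of the multi-line dict.
    params = {
        "objective": "binary",
        "verbose": 0,
    }
    score = 1.0 / (1.0 + rate)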
.pre-commit-config.yaml                                          +8   -6
examples/python-guide/advanced_example.py                        +88  -92
examples/python-guide/dask/ranking.py                            +6   -17
examples/python-guide/dataset_from_multi_hdf5.py                 +16  -18
examples/python-guide/logistic_regression.py                     +20  -35
examples/python-guide/notebooks/interactive_plot_example.ipynb   +62  -59
examples/python-guide/plot_example.py                            +18  -25
examples/python-guide/simple_example.py                          +21  -23
examples/python-guide/sklearn_example.py                         +22  -36
python-package/lightgbm/basic.py                                 +17  -3
python-package/lightgbm/callback.py                              +8   -2
python-package/lightgbm/dask.py                                  +30  -6
python-package/lightgbm/engine.py                                +15  -4
python-package/lightgbm/sklearn.py                               +34  -7
python-package/pyproject.toml                                    +6   -3
tests/c_api_test/test_.py                                        +58  -76
tests/cpp_tests/test.py                                          +1   -1
tests/distributed/_test_distributed.py                           +47  -47
tests/distributed/conftest.py                                    +2   -2
tests/python_package_test/test_arrow.py                          +5   -15
.pre-commit-config.yaml

@@ -7,6 +7,12 @@ exclude: |
   )$

 repos:
+  - repo: https://github.com/pycqa/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        name: isort (python)
+        args: ["--settings-path", "python-package/pyproject.toml"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
     rev: v0.2.1

@@ -14,12 +20,8 @@ repos:
       # Run the linter.
       - id: ruff
         args: ["--config", "python-package/pyproject.toml"]
         types_or: [python, jupyter]
+      # Run the formatter.
+      - id: ruff-format
+        args: ["--config", "python-package/pyproject.toml"]
-  - repo: https://github.com/pycqa/isort
-    rev: 5.13.2
-    hooks:
-      - id: isort
-        name: isort (python)
-        args: ["--settings-path", "python-package/pyproject.toml"]
-        types_or: [python, jupyter]
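With both tools declared as pre-commit hooks, contributors do not call isort or ruff directly. A sketch of invoking the full hook chain from Python — the subprocess call is an assumed local workflow, not something this commit adds:

    import subprocess

    # Runs isort, ruff (lint), and ruff-format over the whole repository, in
    # the order the hooks are declared in .pre-commit-config.yaml above.
    subprocess.run(["pre-commit", "run", "--all-files"], check=True)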
examples/python-guide/advanced_example.py

@@ -10,13 +10,13 @@ from sklearn.metrics import roc_auc_score
 import lightgbm as lgb

-print('Loading data...')
+print("Loading data...")
 # load or create your dataset
-binary_example_dir = Path(__file__).absolute().parents[1] / 'binary_classification'
-df_train = pd.read_csv(str(binary_example_dir / 'binary.train'), header=None, sep='\t')
-df_test = pd.read_csv(str(binary_example_dir / 'binary.test'), header=None, sep='\t')
-W_train = pd.read_csv(str(binary_example_dir / 'binary.train.weight'), header=None)[0]
-W_test = pd.read_csv(str(binary_example_dir / 'binary.test.weight'), header=None)[0]
+binary_example_dir = Path(__file__).absolute().parents[1] / "binary_classification"
+df_train = pd.read_csv(str(binary_example_dir / "binary.train"), header=None, sep="\t")
+df_test = pd.read_csv(str(binary_example_dir / "binary.test"), header=None, sep="\t")
+W_train = pd.read_csv(str(binary_example_dir / "binary.train.weight"), header=None)[0]
+W_test = pd.read_csv(str(binary_example_dir / "binary.test.weight"), header=None)[0]
 y_train = df_train[0]
 y_test = df_test[0]

@@ -27,72 +27,72 @@ num_train, num_feature = X_train.shape
 # create dataset for lightgbm
 # if you want to re-use data, remember to set free_raw_data=False
-lgb_train = lgb.Dataset(X_train, y_train,
-                        weight=W_train, free_raw_data=False)
-lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
-                       weight=W_test, free_raw_data=False)
+lgb_train = lgb.Dataset(X_train, y_train, weight=W_train, free_raw_data=False)
+lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, weight=W_test, free_raw_data=False)

 # specify your configurations as a dict
 params = {
-    'boosting_type': 'gbdt',
-    'objective': 'binary',
-    'metric': 'binary_logloss',
-    'num_leaves': 31,
-    'learning_rate': 0.05,
-    'feature_fraction': 0.9,
-    'bagging_fraction': 0.8,
-    'bagging_freq': 5,
-    'verbose': 0
+    "boosting_type": "gbdt",
+    "objective": "binary",
+    "metric": "binary_logloss",
+    "num_leaves": 31,
+    "learning_rate": 0.05,
+    "feature_fraction": 0.9,
+    "bagging_fraction": 0.8,
+    "bagging_freq": 5,
+    "verbose": 0,
 }

 # generate feature names
-feature_name = [f'feature_{col}' for col in range(num_feature)]
+feature_name = [f"feature_{col}" for col in range(num_feature)]

-print('Starting training...')
+print("Starting training...")
 # feature_name and categorical_feature
-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=10,
-                valid_sets=lgb_train,  # eval training data
-                feature_name=feature_name,
-                categorical_feature=[21])
+gbm = lgb.train(
+    params,
+    lgb_train,
+    num_boost_round=10,
+    valid_sets=lgb_train,  # eval training data
+    feature_name=feature_name,
+    categorical_feature=[21],
+)

-print('Finished first 10 rounds...')
+print("Finished first 10 rounds...")
 # check feature name
-print(f'7th feature name is: {lgb_train.feature_name[6]}')
+print(f"7th feature name is: {lgb_train.feature_name[6]}")

-print('Saving model...')
+print("Saving model...")
 # save model to file
-gbm.save_model('model.txt')
+gbm.save_model("model.txt")

-print('Dumping model to JSON...')
+print("Dumping model to JSON...")
 # dump model to JSON (and save to file)
 model_json = gbm.dump_model()

-with open('model.json', 'w+') as f:
+with open("model.json", "w+") as f:
     json.dump(model_json, f, indent=4)

 # feature names
-print(f'Feature names: {gbm.feature_name()}')
+print(f"Feature names: {gbm.feature_name()}")

 # feature importances
-print(f'Feature importances: {list(gbm.feature_importance())}')
+print(f"Feature importances: {list(gbm.feature_importance())}")

-print('Loading model to predict...')
+print("Loading model to predict...")
 # load model to predict
-bst = lgb.Booster(model_file='model.txt')
+bst = lgb.Booster(model_file="model.txt")
 # can only predict with the best iteration (or the saving iteration)
 y_pred = bst.predict(X_test)
 # eval with loaded model
 auc_loaded_model = roc_auc_score(y_test, y_pred)
 print(f"The ROC AUC of loaded model's prediction is: {auc_loaded_model}")

-print('Dumping and loading model with pickle...')
+print("Dumping and loading model with pickle...")
 # dump model with pickle
-with open('model.pkl', 'wb') as fout:
+with open("model.pkl", "wb") as fout:
     pickle.dump(gbm, fout)
 # load model with pickle to predict
-with open('model.pkl', 'rb') as fin:
+with open("model.pkl", "rb") as fin:
     pkl_bst = pickle.load(fin)
 # can predict with any iteration when loaded in pickle way
 y_pred = pkl_bst.predict(X_test, num_iteration=7)

@@ -104,36 +104,36 @@ print(f"The ROC AUC of pickled model's prediction is: {auc_pickled_model}")
 # continue training
 # init_model accepts:
 # 1. model file name
 # 2. Booster()
-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=10,
-                init_model='model.txt',
-                valid_sets=lgb_eval)
+gbm = lgb.train(params, lgb_train, num_boost_round=10, init_model="model.txt", valid_sets=lgb_eval)

-print('Finished 10 - 20 rounds with model file...')
+print("Finished 10 - 20 rounds with model file...")

 # decay learning rates
 # reset_parameter callback accepts:
 # 1. list with length = num_boost_round
 # 2. function(curr_iter)
-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=10,
-                init_model=gbm,
-                valid_sets=lgb_eval,
-                callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99 ** iter))])
+gbm = lgb.train(
+    params,
+    lgb_train,
+    num_boost_round=10,
+    init_model=gbm,
+    valid_sets=lgb_eval,
+    callbacks=[lgb.reset_parameter(learning_rate=lambda iter: 0.05 * (0.99**iter))],
+)

-print('Finished 20 - 30 rounds with decay learning rates...')
+print("Finished 20 - 30 rounds with decay learning rates...")

 # change other parameters during training
-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=10,
-                init_model=gbm,
-                valid_sets=lgb_eval,
-                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
+gbm = lgb.train(
+    params,
+    lgb_train,
+    num_boost_round=10,
+    init_model=gbm,
+    valid_sets=lgb_eval,
+    callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)],
+)

-print('Finished 30 - 40 rounds with changing bagging_fraction...')
+print("Finished 30 - 40 rounds with changing bagging_fraction...")

 # self-defined objective function

@@ -141,9 +141,9 @@ print('Finished 30 - 40 rounds with changing bagging_fraction...')
 # log likelihood loss
 def loglikelihood(preds, train_data):
     labels = train_data.get_label()
-    preds = 1. / (1. + np.exp(-preds))
+    preds = 1.0 / (1.0 + np.exp(-preds))
     grad = preds - labels
-    hess = preds * (1. - preds)
+    hess = preds * (1.0 - preds)
     return grad, hess

@@ -156,22 +156,19 @@ def loglikelihood(preds, train_data):
 # Keep this in mind when you use the customization
 def binary_error(preds, train_data):
     labels = train_data.get_label()
-    preds = 1. / (1. + np.exp(-preds))
-    return 'error', np.mean(labels != (preds > 0.5)), False
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    return "error", np.mean(labels != (preds > 0.5)), False


 # Pass custom objective function through params
 params_custom_obj = copy.deepcopy(params)
-params_custom_obj['objective'] = loglikelihood
+params_custom_obj["objective"] = loglikelihood

-gbm = lgb.train(params_custom_obj,
-                lgb_train,
-                num_boost_round=10,
-                init_model=gbm,
-                feval=binary_error,
-                valid_sets=lgb_eval)
+gbm = lgb.train(
+    params_custom_obj, lgb_train, num_boost_round=10, init_model=gbm, feval=binary_error, valid_sets=lgb_eval
+)

-print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
+print("Finished 40 - 50 rounds with self-defined objective function and eval metric...")

 # another self-defined eval metric

@@ -183,24 +180,26 @@ print('Finished 40 - 50 rounds with self-defined objective function and eval met
 # Keep this in mind when you use the customization
 def accuracy(preds, train_data):
     labels = train_data.get_label()
-    preds = 1. / (1. + np.exp(-preds))
-    return 'accuracy', np.mean(labels == (preds > 0.5)), True
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    return "accuracy", np.mean(labels == (preds > 0.5)), True


 # Pass custom objective function through params
 params_custom_obj = copy.deepcopy(params)
-params_custom_obj['objective'] = loglikelihood
+params_custom_obj["objective"] = loglikelihood

-gbm = lgb.train(params_custom_obj,
-                lgb_train,
-                num_boost_round=10,
-                init_model=gbm,
-                feval=[binary_error, accuracy],
-                valid_sets=lgb_eval)
+gbm = lgb.train(
+    params_custom_obj,
+    lgb_train,
+    num_boost_round=10,
+    init_model=gbm,
+    feval=[binary_error, accuracy],
+    valid_sets=lgb_eval,
+)

-print('Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...')
+print("Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...")

-print('Starting a new training job...')
+print("Starting a new training job...")

 # callback

@@ -208,17 +207,14 @@ def reset_metrics():
     def callback(env):
         lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
         if env.iteration - env.begin_iteration == 5:
-            print('Add a new valid dataset at iteration 5...')
-            env.model.add_valid(lgb_eval_new, 'new_valid')
+            print("Add a new valid dataset at iteration 5...")
+            env.model.add_valid(lgb_eval_new, "new_valid")

     callback.before_iteration = True
     callback.order = 0
     return callback


-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=10,
-                valid_sets=lgb_train,
-                callbacks=[reset_metrics()])
+gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_train, callbacks=[reset_metrics()])

-print('Finished first 10 rounds with callback function...')
+print("Finished first 10 rounds with callback function...")
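Note how ruff-format collapses some lgb.train(...) calls above onto a single line while exploding others to one argument per line. That is the magic trailing comma at work (this repository sets skip-magic-trailing-comma = false in pyproject.toml below). A minimal sketch with a hypothetical helper, just to isolate the rule:

    def make_dataset(path, max_bin):
        # hypothetical stand-in for a real constructor
        return (path, max_bin)

    # No trailing comma and the call fits in the line limit: formatted onto one line.
    ds = make_dataset("binary.train", 15)

    # A trailing comma after the last argument is "magic": ruff-format keeps the
    # exploded one-argument-per-line layout instead of collapsing it.
    ds = make_dataset(
        "binary.train",
        15,
    )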
examples/python-guide/dask/ranking.py

@@ -10,9 +10,9 @@ import lightgbm as lgb
 if __name__ == "__main__":
     print("loading data")

-    rank_example_dir = Path(__file__).absolute().parents[2] / 'lambdarank'
-    X, y = load_svmlight_file(str(rank_example_dir / 'rank.train'))
-    group = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
+    rank_example_dir = Path(__file__).absolute().parents[2] / "lambdarank"
+    X, y = load_svmlight_file(str(rank_example_dir / "rank.train"))
+    group = np.loadtxt(str(rank_example_dir / "rank.train.query"))

     print("initializing a Dask cluster")

@@ -32,25 +32,14 @@ if __name__ == "__main__":
     # a sparse boundary to partition the data
     X = X.toarray()

-    dX = da.from_array(
-        x=X,
-        chunks=[
-            (rows_in_part1, rows_in_part2),
-            (num_features,)
-        ]
-    )
+    dX = da.from_array(x=X, chunks=[(rows_in_part1, rows_in_part2), (num_features,)])
     dy = da.from_array(
         x=y,
         chunks=[
             (rows_in_part1, rows_in_part2),
-        ]
-    )
-    dg = da.from_array(
-        x=group,
-        chunks=[
-            (100, group.size - 100)
-        ]
+        ],
     )
+    dg = da.from_array(x=group, chunks=[(100, group.size - 100)])

     print("beginning training")
examples/python-guide/dataset_from_multi_hdf5.py

@@ -34,13 +34,13 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size):
     data = []
     ylist = []
     for f in input_flist:
-        f = h5py.File(f, 'r')
-        data.append(HDFSequence(f['X'], batch_size))
-        ylist.append(f['Y'][:])
+        f = h5py.File(f, "r")
+        data.append(HDFSequence(f["X"], batch_size))
+        ylist.append(f["Y"][:])

     params = {
-        'bin_construct_sample_cnt': 200000,
-        'max_bin': 255,
+        "bin_construct_sample_cnt": 200000,
+        "max_bin": 255,
     }
     y = np.concatenate(ylist)
     dataset = lgb.Dataset(data, label=y, params=params)

@@ -51,7 +51,7 @@ def create_dataset_from_multiple_hdf(input_flist, batch_size):
     # The reason is that DataFrame column names will be used in Dataset. For a DataFrame with Int64Index
     # as columns, Dataset will use column names like ["0", "1", "2", ...]. While for numpy array, column names
     # are using the default one assigned in C++ code (dataset_loader.cpp), like ["Column_0", "Column_1", ...].
-    dataset.save_binary('regression.train.from_hdf.bin')
+    dataset.save_binary("regression.train.from_hdf.bin")


 def save2hdf(input_data, fname, batch_size):

@@ -59,7 +59,7 @@ def save2hdf(input_data, fname, batch_size):
     Please note chunk size settings in the implementation for I/O performance optimization.
     """
-    with h5py.File(fname, 'w') as f:
+    with h5py.File(fname, "w") as f:
         for name, data in input_data.items():
             nrow, ncol = data.shape
             if ncol == 1:

@@ -75,12 +75,12 @@ def save2hdf(input_data, fname, batch_size):
             # Also note that the data is stored in row major order to avoid extra copy when passing to
             # lightgbm Dataset.
             chunk = (batch_size, ncol)
-            f.create_dataset(name, data=data, chunks=chunk, compression='lzf')
+            f.create_dataset(name, data=data, chunks=chunk, compression="lzf")


 def generate_hdf(input_fname, output_basename, batch_size):
     # Save to 2 HDF5 files for demonstration.
-    df = pd.read_csv(input_fname, header=None, sep='\t')
+    df = pd.read_csv(input_fname, header=None, sep="\t")
     mid = len(df) // 2
     df1 = df.iloc[:mid]

@@ -88,25 +88,23 @@ def generate_hdf(input_fname, output_basename, batch_size):
     # We can store multiple datasets inside a single HDF5 file.
     # Separating X and Y for choosing best chunk size for data loading.
-    fname1 = f'{output_basename}1.h5'
-    fname2 = f'{output_basename}2.h5'
-    save2hdf({'Y': df1.iloc[:, :1], 'X': df1.iloc[:, 1:]}, fname1, batch_size)
-    save2hdf({'Y': df2.iloc[:, :1], 'X': df2.iloc[:, 1:]}, fname2, batch_size)
+    fname1 = f"{output_basename}1.h5"
+    fname2 = f"{output_basename}2.h5"
+    save2hdf({"Y": df1.iloc[:, :1], "X": df1.iloc[:, 1:]}, fname1, batch_size)
+    save2hdf({"Y": df2.iloc[:, :1], "X": df2.iloc[:, 1:]}, fname2, batch_size)

     return [fname1, fname2]


 def main():
     batch_size = 64
-    output_basename = 'regression'
+    output_basename = "regression"
     hdf_files = generate_hdf(
-        str(Path(__file__).absolute().parents[1] / 'regression' / 'regression.train'),
-        output_basename,
-        batch_size
+        str(Path(__file__).absolute().parents[1] / "regression" / "regression.train"), output_basename, batch_size
     )

     create_dataset_from_multiple_hdf(hdf_files, batch_size=batch_size)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
examples/python-guide/logistic_regression.py

@@ -24,23 +24,19 @@ import lightgbm as lgb
 # single continuous predictor
 np.random.seed(0)
 N = 1000
-X = pd.DataFrame({
-    'continuous': range(N),
-    'categorical': np.repeat([0, 1, 2, 3, 4], N / 5)
-})
+X = pd.DataFrame({"continuous": range(N), "categorical": np.repeat([0, 1, 2, 3, 4], N / 5)})
 CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2]
-LINEAR_TERM = np.array([
-    -0.5 + 0.01 * X['continuous'][k]
-    + CATEGORICAL_EFFECTS[X['categorical'][k]] for k in range(X.shape[0])
-]) + np.random.normal(0, 1, X.shape[0])
+LINEAR_TERM = np.array(
+    [-0.5 + 0.01 * X["continuous"][k] + CATEGORICAL_EFFECTS[X["categorical"][k]] for k in range(X.shape[0])]
+) + np.random.normal(0, 1, X.shape[0])
 TRUE_PROB = expit(LINEAR_TERM)
 Y = np.random.binomial(1, TRUE_PROB, size=N)
 DATA = {
-    'X': X,
-    'probability_labels': TRUE_PROB,
-    'binary_labels': Y,
-    'lgb_with_binary_labels': lgb.Dataset(X, Y),
-    'lgb_with_probability_labels': lgb.Dataset(X, TRUE_PROB),
+    "X": X,
+    "probability_labels": TRUE_PROB,
+    "binary_labels": Y,
+    "lgb_with_binary_labels": lgb.Dataset(X, Y),
+    "lgb_with_probability_labels": lgb.Dataset(X, TRUE_PROB),
 }

@@ -72,34 +68,25 @@ def experiment(objective, label_type, data):
     np.random.seed(0)
     nrounds = 5
     lgb_data = data[f"lgb_with_{label_type}_labels"]
-    params = {
-        'objective': objective,
-        'feature_fraction': 1,
-        'bagging_fraction': 1,
-        'verbose': -1
-    }
+    params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1}
     time_zero = time.time()
     gbm = lgb.train(params, lgb_data, num_boost_round=nrounds)
-    y_fitted = gbm.predict(data['X'])
+    y_fitted = gbm.predict(data["X"])
     y_true = data[f"{label_type}_labels"]
     duration = time.time() - time_zero
-    return {
-        'time': duration,
-        'correlation': np.corrcoef(y_fitted, y_true)[0, 1],
-        'logloss': log_loss(y_fitted, y_true)
-    }
+    return {"time": duration, "correlation": np.corrcoef(y_fitted, y_true)[0, 1], "logloss": log_loss(y_fitted, y_true)}


 #################
 # Observe the behavior of `binary` and `xentropy` objectives
-print('Performance of `binary` objective with binary labels:')
-print(experiment('binary', label_type='binary', data=DATA))
+print("Performance of `binary` objective with binary labels:")
+print(experiment("binary", label_type="binary", data=DATA))

-print('Performance of `xentropy` objective with binary labels:')
-print(experiment('xentropy', label_type='binary', data=DATA))
+print("Performance of `xentropy` objective with binary labels:")
+print(experiment("xentropy", label_type="binary", data=DATA))

-print('Performance of `xentropy` objective with probability labels:')
-print(experiment('xentropy', label_type='probability', data=DATA))
+print("Performance of `xentropy` objective with probability labels:")
+print(experiment("xentropy", label_type="probability", data=DATA))

 # Trying this throws an error on non-binary values of y:
 # experiment('binary', label_type='probability', DATA)

@@ -109,9 +96,7 @@ print(experiment('xentropy', label_type='probability', data=DATA))
 # there are reasons to suspect that `binary` should run faster when the
 # label is an integer instead of a float
 K = 10
-A = [experiment('binary', label_type='binary', data=DATA)['time']
-     for k in range(K)]
-B = [experiment('xentropy', label_type='binary', data=DATA)['time']
-     for k in range(K)]
+A = [experiment("binary", label_type="binary", data=DATA)["time"] for k in range(K)]
+B = [experiment("xentropy", label_type="binary", data=DATA)["time"] for k in range(K)]
 print(f"Best `binary` time: {min(A)}")
 print(f"Best `xentropy` time: {min(B)}")
examples/python-guide/notebooks/interactive_plot_example.ipynb

This source diff could not be displayed because it is too large. You can view the blob instead.
examples/python-guide/plot_example.py

@@ -8,13 +8,13 @@ import lightgbm as lgb
 if lgb.compat.MATPLOTLIB_INSTALLED:
     import matplotlib.pyplot as plt
 else:
-    raise ImportError('You need to install matplotlib and restart your session for plot_example.py.')
+    raise ImportError("You need to install matplotlib and restart your session for plot_example.py.")

-print('Loading data...')
+print("Loading data...")
 # load or create your dataset
-regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
-df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
-df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
+regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
+df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
+df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
 y_train = df_train[0]
 y_test = df_test[0]

@@ -26,45 +26,38 @@ lgb_train = lgb.Dataset(X_train, y_train)
 lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

 # specify your configurations as a dict
-params = {
-    'num_leaves': 5,
-    'metric': ('l1', 'l2'),
-    'verbose': 0
-}
+params = {"num_leaves": 5, "metric": ("l1", "l2"), "verbose": 0}

 evals_result = {}  # to record eval results for plotting

-print('Starting training...')
+print("Starting training...")
 # train
 gbm = lgb.train(
     params,
     lgb_train,
     num_boost_round=100,
     valid_sets=[lgb_train, lgb_test],
-    feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
+    feature_name=[f"f{i + 1}" for i in range(X_train.shape[-1])],
     categorical_feature=[21],
-    callbacks=[
-        lgb.log_evaluation(10),
-        lgb.record_evaluation(evals_result)
-    ]
+    callbacks=[lgb.log_evaluation(10), lgb.record_evaluation(evals_result)],
 )

-print('Plotting metrics recorded during training...')
-ax = lgb.plot_metric(evals_result, metric='l1')
+print("Plotting metrics recorded during training...")
+ax = lgb.plot_metric(evals_result, metric="l1")
 plt.show()

-print('Plotting feature importances...')
+print("Plotting feature importances...")
 ax = lgb.plot_importance(gbm, max_num_features=10)
 plt.show()

-print('Plotting split value histogram...')
-ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
+print("Plotting split value histogram...")
+ax = lgb.plot_split_value_histogram(gbm, feature="f26", bins="auto")
 plt.show()

-print('Plotting 54th tree...')
+print("Plotting 54th tree...")
 # one tree use categorical feature to split
-ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain'])
+ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=["split_gain"])
 plt.show()

-print('Plotting 54th tree with graphviz...')
-graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54')
+print("Plotting 54th tree with graphviz...")
+graph = lgb.create_tree_digraph(gbm, tree_index=53, name="Tree54")
 graph.render(view=True)
examples/python-guide/simple_example.py

@@ -6,11 +6,11 @@ from sklearn.metrics import mean_squared_error
 import lightgbm as lgb

-print('Loading data...')
+print("Loading data...")
 # load or create your dataset
-regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
-df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
-df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
+regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
+df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
+df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
 y_train = df_train[0]
 y_test = df_test[0]

@@ -23,32 +23,30 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
 # specify your configurations as a dict
 params = {
-    'boosting_type': 'gbdt',
-    'objective': 'regression',
-    'metric': {'l2', 'l1'},
-    'num_leaves': 31,
-    'learning_rate': 0.05,
-    'feature_fraction': 0.9,
-    'bagging_fraction': 0.8,
-    'bagging_freq': 5,
-    'verbose': 0
+    "boosting_type": "gbdt",
+    "objective": "regression",
+    "metric": {"l2", "l1"},
+    "num_leaves": 31,
+    "learning_rate": 0.05,
+    "feature_fraction": 0.9,
+    "bagging_fraction": 0.8,
+    "bagging_freq": 5,
+    "verbose": 0,
 }

-print('Starting training...')
+print("Starting training...")
 # train
-gbm = lgb.train(params,
-                lgb_train,
-                num_boost_round=20,
-                valid_sets=lgb_eval,
-                callbacks=[lgb.early_stopping(stopping_rounds=5)])
+gbm = lgb.train(
+    params, lgb_train, num_boost_round=20, valid_sets=lgb_eval, callbacks=[lgb.early_stopping(stopping_rounds=5)]
+)

-print('Saving model...')
+print("Saving model...")
 # save model to file
-gbm.save_model('model.txt')
+gbm.save_model("model.txt")

-print('Starting predicting...')
+print("Starting predicting...")
 # predict
 y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
 # eval
 rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
-print(f'The RMSE of prediction is: {rmse_test}')
+print(f"The RMSE of prediction is: {rmse_test}")
examples/python-guide/sklearn_example.py

@@ -8,85 +8,71 @@ from sklearn.model_selection import GridSearchCV
 import lightgbm as lgb

-print('Loading data...')
+print("Loading data...")
 # load or create your dataset
-regression_example_dir = Path(__file__).absolute().parents[1] / 'regression'
-df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
-df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')
+regression_example_dir = Path(__file__).absolute().parents[1] / "regression"
+df_train = pd.read_csv(str(regression_example_dir / "regression.train"), header=None, sep="\t")
+df_test = pd.read_csv(str(regression_example_dir / "regression.test"), header=None, sep="\t")
 y_train = df_train[0]
 y_test = df_test[0]
 X_train = df_train.drop(0, axis=1)
 X_test = df_test.drop(0, axis=1)

-print('Starting training...')
+print("Starting training...")
 # train
 gbm = lgb.LGBMRegressor(num_leaves=31, learning_rate=0.05, n_estimators=20)
-gbm.fit(X_train, y_train,
-        eval_set=[(X_test, y_test)],
-        eval_metric='l1',
-        callbacks=[lgb.early_stopping(5)])
+gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="l1", callbacks=[lgb.early_stopping(5)])

-print('Starting predicting...')
+print("Starting predicting...")
 # predict
 y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
 # eval
 rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
-print(f'The RMSE of prediction is: {rmse_test}')
+print(f"The RMSE of prediction is: {rmse_test}")

 # feature importances
-print(f'Feature importances: {list(gbm.feature_importances_)}')
+print(f"Feature importances: {list(gbm.feature_importances_)}")


 # self-defined eval metric
 # f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
 # Root Mean Squared Logarithmic Error (RMSLE)
 def rmsle(y_true, y_pred):
-    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
+    return "RMSLE", np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False


-print('Starting training with custom eval function...')
+print("Starting training with custom eval function...")
 # train
-gbm.fit(X_train, y_train,
-        eval_set=[(X_test, y_test)],
-        eval_metric=rmsle,
-        callbacks=[lgb.early_stopping(5)])
+gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=rmsle, callbacks=[lgb.early_stopping(5)])


 # another self-defined eval metric
 # f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
 # Relative Absolute Error (RAE)
 def rae(y_true, y_pred):
-    return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False
+    return "RAE", np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False


-print('Starting training with multiple custom eval functions...')
+print("Starting training with multiple custom eval functions...")
 # train
-gbm.fit(X_train, y_train,
-        eval_set=[(X_test, y_test)],
-        eval_metric=[rmsle, rae],
-        callbacks=[lgb.early_stopping(5)])
+gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=[rmsle, rae], callbacks=[lgb.early_stopping(5)])

-print('Starting predicting...')
+print("Starting predicting...")
 # predict
 y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
 # eval
 rmsle_test = rmsle(y_test, y_pred)[1]
 rae_test = rae(y_test, y_pred)[1]
-print(f'The RMSLE of prediction is: {rmsle_test}')
-print(f'The RAE of prediction is: {rae_test}')
+print(f"The RMSLE of prediction is: {rmsle_test}")
+print(f"The RAE of prediction is: {rae_test}")

 # other scikit-learn modules
 estimator = lgb.LGBMRegressor(num_leaves=31)

-param_grid = {
-    'learning_rate': [0.01, 0.1, 1],
-    'n_estimators': [20, 40]
-}
+param_grid = {"learning_rate": [0.01, 0.1, 1], "n_estimators": [20, 40]}

 gbm = GridSearchCV(estimator, param_grid, cv=3)
 gbm.fit(X_train, y_train)

-print(f'Best parameters found by grid search are: {gbm.best_params_}')
+print(f"Best parameters found by grid search are: {gbm.best_params_}")
python-package/lightgbm/basic.py

@@ -18,9 +18,23 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional,
 import numpy as np
 import scipy.sparse

-from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat,
-                     dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table,
-                     pd_CategoricalDtype, pd_DataFrame, pd_Series)
+from .compat import (
+    PANDAS_INSTALLED,
+    PYARROW_INSTALLED,
+    arrow_cffi,
+    arrow_is_floating,
+    arrow_is_integer,
+    concat,
+    dt_DataTable,
+    pa_Array,
+    pa_chunked_array,
+    pa_ChunkedArray,
+    pa_compute,
+    pa_Table,
+    pd_CategoricalDtype,
+    pd_DataFrame,
+    pd_Series,
+)
 from .libpath import find_lib_path

 if TYPE_CHECKING:
python-package/lightgbm/callback.py

@@ -5,8 +5,14 @@ from dataclasses import dataclass
 from functools import partial
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union

-from .basic import (Booster, _ConfigAliases, _LGBM_BoosterEvalMethodResultType,
-                    _LGBM_BoosterEvalMethodResultWithStandardDeviationType, _log_info, _log_warning)
+from .basic import (
+    Booster,
+    _ConfigAliases,
+    _LGBM_BoosterEvalMethodResultType,
+    _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
+    _log_info,
+    _log_warning,
+)

 if TYPE_CHECKING:
     from .engine import CVBooster
python-package/lightgbm/dask.py

@@ -19,12 +19,36 @@ import numpy as np
 import scipy.sparse as ss

 from .basic import LightGBMError, _choose_param_value, _ConfigAliases, _log_info, _log_warning
-from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, Future, LGBMNotFittedError, concat,
-                     dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series,
-                     default_client, delayed, pd_DataFrame, pd_Series, wait)
-from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomObjectiveFunction,
-                      _LGBM_ScikitEvalMetricType, _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit,
-                      _lgbmmodel_doc_predict)
+from .compat import (
+    DASK_INSTALLED,
+    PANDAS_INSTALLED,
+    SKLEARN_INSTALLED,
+    Client,
+    Future,
+    LGBMNotFittedError,
+    concat,
+    dask_Array,
+    dask_array_from_delayed,
+    dask_bag_from_delayed,
+    dask_DataFrame,
+    dask_Series,
+    default_client,
+    delayed,
+    pd_DataFrame,
+    pd_Series,
+    wait,
+)
+from .sklearn import (
+    LGBMClassifier,
+    LGBMModel,
+    LGBMRanker,
+    LGBMRegressor,
+    _LGBM_ScikitCustomObjectiveFunction,
+    _LGBM_ScikitEvalMetricType,
+    _lgbmmodel_doc_custom_eval_note,
+    _lgbmmodel_doc_fit,
+    _lgbmmodel_doc_predict,
+)

 __all__ = [
     'DaskLGBMClassifier',
python-package/lightgbm/engine.py

@@ -10,10 +10,21 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 import numpy as np

 from . import callback
-from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor,
-                    _LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
-                    _LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType,
-                    _LGBM_FeatureNameConfiguration, _log_warning)
+from .basic import (
+    Booster,
+    Dataset,
+    LightGBMError,
+    _choose_param_value,
+    _ConfigAliases,
+    _InnerPredictor,
+    _LGBM_BoosterEvalMethodResultType,
+    _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
+    _LGBM_CategoricalFeatureConfiguration,
+    _LGBM_CustomObjectiveFunction,
+    _LGBM_EvalFunctionResultType,
+    _LGBM_FeatureNameConfiguration,
+    _log_warning,
+)
 from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold

 __all__ = [
python-package/lightgbm/sklearn.py

@@ -8,14 +8,41 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import scipy.sparse

-from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
-                    _LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType,
-                    _LGBM_FeatureNameConfiguration, _LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType,
-                    _LGBM_WeightType, _log_warning)
+from .basic import (
+    Booster,
+    Dataset,
+    LightGBMError,
+    _choose_param_value,
+    _ConfigAliases,
+    _LGBM_BoosterBestScoreType,
+    _LGBM_CategoricalFeatureConfiguration,
+    _LGBM_EvalFunctionResultType,
+    _LGBM_FeatureNameConfiguration,
+    _LGBM_GroupType,
+    _LGBM_InitScoreType,
+    _LGBM_LabelType,
+    _LGBM_WeightType,
+    _log_warning,
+)
 from .callback import _EvalResultDict, record_evaluation
-from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
-                     _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
-                     _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
-                     dt_DataTable, np_random_Generator, pd_DataFrame)
+from .compat import (
+    SKLEARN_INSTALLED,
+    LGBMNotFittedError,
+    _LGBMAssertAllFinite,
+    _LGBMCheckArray,
+    _LGBMCheckClassificationTargets,
+    _LGBMCheckSampleWeight,
+    _LGBMCheckXY,
+    _LGBMClassifierBase,
+    _LGBMComputeSampleWeight,
+    _LGBMCpuCount,
+    _LGBMLabelEncoder,
+    _LGBMModelBase,
+    _LGBMRegressorBase,
+    dt_DataTable,
+    np_random_Generator,
+    pd_DataFrame,
+)
 from .engine import train

 __all__ = [
python-package/pyproject.toml

@@ -81,10 +81,14 @@ minimum-version = "0.4.4"
 # end:build-system

 [tool.isort]
+include_trailing_comma = true
 line_length = 120
+# "vertical hanging indent", to match what ruff-format does
+# ref: https://pycqa.github.io/isort/docs/configuration/multi_line_output_modes.html#3-vertical-hanging-indent
+multi_line_output = 3
 skip_glob = [
     "*/external_libs/*",
-    "*/lightgbm-python/*"
+    "*/lightgbm-python/*",
 ]

 [tool.mypy]

@@ -108,14 +112,13 @@ docstring-code-format = false
 exclude = [
     "build/*.py",
     "compile/*.py",
-    "examples/*.py",
     "external_libs/*.py",
     "lightgbm-python/*.py",
     "python-package/*.py",
-    "tests/*.py"
 ]
 indent-style = "space"
 quote-style = "double"
+skip-magic-trailing-comma = false

 [tool.ruff.lint]
 ignore = [
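The import-block rewrites in the python-package/lightgbm/*.py hunks above come from these isort settings rather than from ruff-format (python-package stays in the format exclude list). A sketch of the two wrapping styles on a standard-library module, so it runs anywhere:

    # Default isort wrapping at line_length = 120: names packed per line,
    # continuation aligned under the opening parenthesis.
    from collections.abc import (Callable, Iterable, Iterator, Mapping,
                                 MutableMapping, Sequence)

    # multi_line_output = 3 with include_trailing_comma = true ("vertical hanging
    # indent"): one name per line plus a trailing comma, matching ruff-format.
    from collections.abc import (
        Callable,
        Iterable,
        Iterator,
        Mapping,
        MutableMapping,
        Sequence,
    )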
tests/c_api_test/test_.py

@@ -10,7 +10,7 @@ try:
     from lightgbm.basic import _LIB as LIB
 except ModuleNotFoundError:
     print("Could not import lightgbm Python package, looking for lib_lightgbm at the repo root")
-    if system() in ('Windows', 'Microsoft'):
+    if system() in ("Windows", "Microsoft"):
         lib_file = Path(__file__).absolute().parents[2] / "Release" / "lib_lightgbm.dll"
     else:
         lib_file = Path(__file__).absolute().parents[2] / "lib_lightgbm.so"

@@ -25,7 +25,7 @@ dtype_int64 = 3
 def c_str(string):
-    return ctypes.c_char_p(string.encode('utf-8'))
+    return ctypes.c_char_p(string.encode("utf-8"))


 def load_from_file(filename, reference):

@@ -33,17 +33,13 @@ def load_from_file(filename, reference):
     if reference is not None:
         ref = reference
     handle = ctypes.c_void_p()
-    LIB.LGBM_DatasetCreateFromFile(
-        c_str(str(filename)),
-        c_str('max_bin=15'),
-        ref,
-        ctypes.byref(handle))
+    LIB.LGBM_DatasetCreateFromFile(c_str(str(filename)), c_str("max_bin=15"), ref, ctypes.byref(handle))
     print(LIB.LGBM_GetLastError())
     num_data = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
     num_feature = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
-    print(f'#data: {num_data.value} #feature: {num_feature.value}')
+    print(f"#data: {num_data.value} #feature: {num_feature.value}")
     return handle

@@ -69,20 +65,22 @@ def load_from_csr(filename, reference):
         ctypes.c_int64(len(csr.indptr)),
         ctypes.c_int64(len(csr.data)),
         ctypes.c_int64(csr.shape[1]),
-        c_str('max_bin=15'),
+        c_str("max_bin=15"),
         ref,
-        ctypes.byref(handle))
+        ctypes.byref(handle),
+    )
     num_data = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
     num_feature = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
     LIB.LGBM_DatasetSetField(
         handle,
-        c_str('label'),
+        c_str("label"),
         label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
         ctypes.c_int(len(label)),
-        ctypes.c_int(dtype_float32))
-    print(f'#data: {num_data.value} #feature: {num_feature.value}')
+        ctypes.c_int(dtype_float32),
+    )
+    print(f"#data: {num_data.value} #feature: {num_feature.value}")
     return handle

@@ -104,20 +102,22 @@ def load_from_csc(filename, reference):
         ctypes.c_int64(len(csc.indptr)),
         ctypes.c_int64(len(csc.data)),
         ctypes.c_int64(csc.shape[0]),
-        c_str('max_bin=15'),
+        c_str("max_bin=15"),
         ref,
-        ctypes.byref(handle))
+        ctypes.byref(handle),
+    )
     num_data = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
     num_feature = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
     LIB.LGBM_DatasetSetField(
         handle,
-        c_str('label'),
+        c_str("label"),
         label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
         ctypes.c_int(len(label)),
-        ctypes.c_int(dtype_float32))
-    print(f'#data: {num_data.value} #feature: {num_feature.value}')
+        ctypes.c_int(dtype_float32),
+    )
+    print(f"#data: {num_data.value} #feature: {num_feature.value}")
     return handle

@@ -137,20 +137,22 @@ def load_from_mat(filename, reference):
         ctypes.c_int32(mat.shape[0]),
         ctypes.c_int32(mat.shape[1]),
         ctypes.c_int(1),
-        c_str('max_bin=15'),
+        c_str("max_bin=15"),
         ref,
-        ctypes.byref(handle))
+        ctypes.byref(handle),
+    )
     num_data = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
     num_feature = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
     LIB.LGBM_DatasetSetField(
         handle,
-        c_str('label'),
+        c_str("label"),
         label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
         ctypes.c_int(len(label)),
-        ctypes.c_int(dtype_float32))
-    print(f'#data: {num_data.value} #feature: {num_feature.value}')
+        ctypes.c_int(dtype_float32),
+    )
+    print(f"#data: {num_data.value} #feature: {num_feature.value}")
     return handle

@@ -159,29 +161,26 @@ def free_dataset(handle):

 def test_dataset():
-    binary_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification'
-    train = load_from_file(binary_example_dir / 'binary.train', None)
-    test = load_from_mat(binary_example_dir / 'binary.test', train)
+    binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
+    train = load_from_file(binary_example_dir / "binary.train", None)
+    test = load_from_mat(binary_example_dir / "binary.test", train)
     free_dataset(test)
-    test = load_from_csr(binary_example_dir / 'binary.test', train)
+    test = load_from_csr(binary_example_dir / "binary.test", train)
     free_dataset(test)
-    test = load_from_csc(binary_example_dir / 'binary.test', train)
+    test = load_from_csc(binary_example_dir / "binary.test", train)
     free_dataset(test)
-    save_to_binary(train, 'train.binary.bin')
+    save_to_binary(train, "train.binary.bin")
     free_dataset(train)
-    train = load_from_file('train.binary.bin', None)
+    train = load_from_file("train.binary.bin", None)
     free_dataset(train)


 def test_booster():
-    binary_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification'
-    train = load_from_mat(binary_example_dir / 'binary.train', None)
-    test = load_from_mat(binary_example_dir / 'binary.test', train)
+    binary_example_dir = Path(__file__).absolute().parents[2] / "examples" / "binary_classification"
+    train = load_from_mat(binary_example_dir / "binary.train", None)
+    test = load_from_mat(binary_example_dir / "binary.test", train)
     booster = ctypes.c_void_p()
-    LIB.LGBM_BoosterCreate(
-        train,
-        c_str("app=binary metric=auc num_leaves=31 verbose=0"),
-        ctypes.byref(booster))
+    LIB.LGBM_BoosterCreate(train, c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
     LIB.LGBM_BoosterAddValidData(booster, test)
     is_finished = ctypes.c_int(0)
     for i in range(1, 51):

@@ -189,28 +188,18 @@ def test_booster():
         result = np.array([0.0], dtype=np.float64)
         out_len = ctypes.c_int(0)
-        LIB.LGBM_BoosterGetEval(
-            booster,
-            ctypes.c_int(0),
-            ctypes.byref(out_len),
-            result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
+        LIB.LGBM_BoosterGetEval(
+            booster, ctypes.c_int(0), ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
+        )
         if i % 10 == 0:
-            print(f'{i} iteration test AUC {result[0]:.6f}')
-    LIB.LGBM_BoosterSaveModel(
-        booster,
-        ctypes.c_int(0),
-        ctypes.c_int(-1),
-        ctypes.c_int(0),
-        c_str('model.txt'))
+            print(f"{i} iteration test AUC {result[0]:.6f}")
+    LIB.LGBM_BoosterSaveModel(booster, ctypes.c_int(0), ctypes.c_int(-1), ctypes.c_int(0), c_str("model.txt"))
     LIB.LGBM_BoosterFree(booster)
     free_dataset(train)
     free_dataset(test)
     booster2 = ctypes.c_void_p()
     num_total_model = ctypes.c_int(0)
-    LIB.LGBM_BoosterCreateFromModelfile(
-        c_str('model.txt'),
-        ctypes.byref(num_total_model),
-        ctypes.byref(booster2))
-    data = np.loadtxt(str(binary_example_dir / 'binary.test'), dtype=np.float64)
+    LIB.LGBM_BoosterCreateFromModelfile(c_str("model.txt"), ctypes.byref(num_total_model), ctypes.byref(booster2))
+    data = np.loadtxt(str(binary_example_dir / "binary.test"), dtype=np.float64)
     mat = data[:, 1:]
     preb = np.empty(mat.shape[0], dtype=np.float64)
     num_preb = ctypes.c_int64(0)

@@ -225,58 +214,51 @@ def test_booster():
         ctypes.c_int(1),
         ctypes.c_int(0),
         ctypes.c_int(25),
-        c_str(''),
+        c_str(""),
         ctypes.byref(num_preb),
-        preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
+        preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
+    )
     LIB.LGBM_BoosterPredictForFile(
         booster2,
-        c_str(str(binary_example_dir / 'binary.test')),
+        c_str(str(binary_example_dir / "binary.test")),
         ctypes.c_int(0),
         ctypes.c_int(0),
         ctypes.c_int(0),
         ctypes.c_int(25),
-        c_str(''),
-        c_str('preb.txt'))
+        c_str(""),
+        c_str("preb.txt"),
+    )
     LIB.LGBM_BoosterPredictForFile(
         booster2,
-        c_str(str(binary_example_dir / 'binary.test')),
+        c_str(str(binary_example_dir / "binary.test")),
         ctypes.c_int(0),
         ctypes.c_int(0),
         ctypes.c_int(10),
         ctypes.c_int(25),
-        c_str(''),
-        c_str('preb.txt'))
+        c_str(""),
+        c_str("preb.txt"),
+    )
     LIB.LGBM_BoosterFree(booster2)


 def test_max_thread_control():
     # at initialization, should be -1
     num_threads = ctypes.c_int(0)
-    ret = LIB.LGBM_GetMaxThreads(
-        ctypes.byref(num_threads)
-    )
+    ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
     assert ret == 0
     assert num_threads.value == -1

     # updating that value through the C API should work
-    ret = LIB.LGBM_SetMaxThreads(
-        ctypes.c_int(6)
-    )
+    ret = LIB.LGBM_SetMaxThreads(ctypes.c_int(6))
     assert ret == 0
-    ret = LIB.LGBM_GetMaxThreads(
-        ctypes.byref(num_threads)
-    )
+    ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
     assert ret == 0
     assert num_threads.value == 6

     # resetting to any negative number should set it to -1
-    ret = LIB.LGBM_SetMaxThreads(
-        ctypes.c_int(-123)
-    )
+    ret = LIB.LGBM_SetMaxThreads(ctypes.c_int(-123))
     assert ret == 0
-    ret = LIB.LGBM_GetMaxThreads(
-        ctypes.byref(num_threads)
-    )
+    ret = LIB.LGBM_GetMaxThreads(ctypes.byref(num_threads))
     assert ret == 0
     assert num_threads.value == -1
tests/cpp_tests/test.py

@@ -3,5 +3,5 @@ from pathlib import Path
 import numpy as np

-preds = [np.loadtxt(str(name)) for name in Path(__file__).absolute().parent.glob('*.pred')]
+preds = [np.loadtxt(str(name)) for name in Path(__file__).absolute().parent.glob("*.pred")]
 np.testing.assert_allclose(preds[0], preds[1])
tests/distributed/_test_distributed.py

@@ -14,16 +14,16 @@ from sklearn.metrics import accuracy_score
 TESTS_DIR = Path(__file__).absolute().parent


-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def executable(pytestconfig) -> str:
     """Returns the path to the lightgbm executable."""
-    return pytestconfig.getoption('execfile')
+    return pytestconfig.getoption("execfile")


 def _find_random_open_port() -> int:
     """Find a random open port on localhost."""
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(('', 0))
+        s.bind(("", 0))
         port = s.getsockname()[1]
     return port  # noqa: RET504

@@ -34,7 +34,7 @@ def _generate_n_ports(n: int) -> Generator[int, None, None]:
 def _write_dict(d: Dict, file: io.TextIOWrapper) -> None:
     for k, v in d.items():
-        file.write(f'{k} = {v}\n')
+        file.write(f"{k} = {v}\n")


 def create_data(task: str, n_samples: int = 1_000) -> np.ndarray:

@@ -42,10 +42,10 @@ def create_data(task: str, n_samples: int = 1_000) -> np.ndarray:
     The data is returned as a numpy array with the label as the first column.
     """
-    if task == 'binary-classification':
+    if task == "binary-classification":
         centers = [[-4, -4], [4, 4]]
         X, y = make_blobs(n_samples, centers=centers, random_state=42)
-    elif task == 'regression':
+    elif task == "regression":
         X, y = make_regression(n_samples, n_features=4, n_informative=2, random_state=42)
     return np.hstack([y.reshape(-1, 1), X])

@@ -54,22 +54,22 @@ class DistributedMockup:
     """Simulate distributed training."""

     default_train_config = {
-        'task': 'train',
-        'pre_partition': True,
-        'machine_list_file': TESTS_DIR / 'mlist.txt',
-        'tree_learner': 'data',
-        'force_row_wise': True,
-        'verbose': 0,
-        'num_boost_round': 20,
-        'num_leaves': 15,
-        'num_threads': 2,
+        "task": "train",
+        "pre_partition": True,
+        "machine_list_file": TESTS_DIR / "mlist.txt",
+        "tree_learner": "data",
+        "force_row_wise": True,
+        "verbose": 0,
+        "num_boost_round": 20,
+        "num_leaves": 15,
+        "num_threads": 2,
     }

     default_predict_config = {
-        'task': 'predict',
-        'data': TESTS_DIR / 'train.txt',
-        'input_model': TESTS_DIR / 'model0.txt',
-        'output_result': TESTS_DIR / 'predictions.txt',
+        "task": "predict",
+        "data": TESTS_DIR / "train.txt",
+        "input_model": TESTS_DIR / "model0.txt",
+        "output_result": TESTS_DIR / "predictions.txt",
     }

     def __init__(self, executable: str):

@@ -77,8 +77,8 @@ class DistributedMockup:
     def worker_train(self, i: int) -> subprocess.CompletedProcess:
         """Start the training process on the `i`-th worker."""
-        config_path = TESTS_DIR / f'train{i}.conf'
-        cmd = [self.executable, f'config={config_path}']
+        config_path = TESTS_DIR / f"train{i}.conf"
+        cmd = [self.executable, f"config={config_path}"]
         return subprocess.run(cmd)

     def _set_ports(self) -> None:

@@ -92,18 +92,18 @@ class DistributedMockup:
             ports.update(candidates)
             i += 1
             if i == max_tries:
-                raise RuntimeError('Unable to find non-colliding ports.')
+                raise RuntimeError("Unable to find non-colliding ports.")
         self.listen_ports = list(ports)
-        with open(TESTS_DIR / 'mlist.txt', 'wt') as f:
+        with open(TESTS_DIR / "mlist.txt", "wt") as f:
             for port in self.listen_ports:
-                f.write(f'127.0.0.1 {port}\n')
+                f.write(f"127.0.0.1 {port}\n")

     def _write_data(self, partitions: List[np.ndarray]) -> None:
         """Write all training data as train.txt and each training partition as train{i}.txt."""
         all_data = np.vstack(partitions)
-        np.savetxt(str(TESTS_DIR / 'train.txt'), all_data, delimiter=',')
+        np.savetxt(str(TESTS_DIR / "train.txt"), all_data, delimiter=",")
         for i, partition in enumerate(partitions):
-            np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',')
+            np.savetxt(str(TESTS_DIR / f"train{i}.txt"), partition, delimiter=",")

     def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None:
         """Run the distributed training process on a single machine.

@@ -118,7 +118,7 @@ class DistributedMockup:
         """
         self.train_config = copy.deepcopy(self.default_train_config)
         self.train_config.update(train_config)
-        self.n_workers = self.train_config['num_machines']
+        self.n_workers = self.train_config["num_machines"]
         self._set_ports()
         self._write_data(partitions)
         self.label_ = np.hstack([partition[:, 0] for partition in partitions])

@@ -131,7 +131,7 @@ class DistributedMockup:
         results = [f.result() for f in futures]
         for result in results:
             if result.returncode != 0:
-                raise RuntimeError('Error in training')
+                raise RuntimeError("Error in training")

     def predict(self, predict_config: Dict[str, Any]) -> np.ndarray:
         """Compute the predictions using the model created in the fit step.

@@ -141,14 +141,14 @@ class DistributedMockup:
         """
         self.predict_config = copy.deepcopy(self.default_predict_config)
         self.predict_config.update(predict_config)
-        config_path = TESTS_DIR / 'predict.conf'
-        with open(config_path, 'wt') as file:
+        config_path = TESTS_DIR / "predict.conf"
+        with open(config_path, "wt") as file:
             _write_dict(self.predict_config, file)
-        cmd = [self.executable, f'config={config_path}']
+        cmd = [self.executable, f"config={config_path}"]
         result = subprocess.run(cmd)
         if result.returncode != 0:
-            raise RuntimeError('Error in prediction')
-        return np.loadtxt(str(TESTS_DIR / 'predictions.txt'))
+            raise RuntimeError("Error in prediction")
+        return np.loadtxt(str(TESTS_DIR / "predictions.txt"))

     def write_train_config(self, i: int) -> None:
         """Create a file train{i}.conf with the required configuration to train.

@@ -156,41 +156,41 @@ class DistributedMockup:
         Each worker gets a different port and piece of the data, the rest are the
         model parameters contained in `self.config`.
         """
-        with open(TESTS_DIR / f'train{i}.conf', 'wt') as file:
-            output_model = TESTS_DIR / f'model{i}.txt'
-            data = TESTS_DIR / f'train{i}.txt'
-            file.write(f'output_model = {output_model}\n')
-            file.write(f'local_listen_port = {self.listen_ports[i]}\n')
-            file.write(f'data = {data}\n')
+        with open(TESTS_DIR / f"train{i}.conf", "wt") as file:
+            output_model = TESTS_DIR / f"model{i}.txt"
+            data = TESTS_DIR / f"train{i}.txt"
+            file.write(f"output_model = {output_model}\n")
+            file.write(f"local_listen_port = {self.listen_ports[i]}\n")
+            file.write(f"data = {data}\n")
             _write_dict(self.train_config, file)


 def test_classifier(executable):
     """Test the classification task."""
     num_machines = 2
-    data = create_data(task='binary-classification')
+    data = create_data(task="binary-classification")
     partitions = np.array_split(data, num_machines)
     train_params = {
-        'objective': 'binary',
-        'num_machines': num_machines,
+        "objective": "binary",
+        "num_machines": num_machines,
     }
     clf = DistributedMockup(executable)
     clf.fit(partitions, train_params)
     y_probas = clf.predict(predict_config={})
     y_pred = y_probas > 0.5
-    assert accuracy_score(clf.label_, y_pred) == 1.
+    assert accuracy_score(clf.label_, y_pred) == 1.0


 def test_regressor(executable):
     """Test the regression task."""
     num_machines = 2
-    data = create_data(task='regression')
+    data = create_data(task="regression")
     partitions = np.array_split(data, num_machines)
     train_params = {
-        'objective': 'regression',
-        'num_machines': num_machines,
+        "objective": "regression",
+        "num_machines": num_machines,
     }
     reg = DistributedMockup(executable)
     reg.fit(partitions, train_params)
     y_pred = reg.predict(predict_config={})
-    np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.)
+    np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.0)
tests/distributed/conftest.py

 from pathlib import Path

-default_exec_file = Path(__file__).absolute().parents[2] / 'lightgbm'
+default_exec_file = Path(__file__).absolute().parents[2] / "lightgbm"


 def pytest_addoption(parser):
-    parser.addoption('--execfile', action='store', default=str(default_exec_file))
+    parser.addoption("--execfile", action="store", default=str(default_exec_file))
tests/python_package_test/test_arrow.py

@@ -71,9 +71,7 @@ def generate_random_arrow_table(
     values: Optional[np.ndarray] = None,
 ) -> pa.Table:
     columns = [
-        generate_random_arrow_array(
-            num_datapoints, seed + i, generate_nulls=generate_nulls, values=values
-        )
+        generate_random_arrow_array(num_datapoints, seed + i, generate_nulls=generate_nulls, values=values)
         for i in range(num_columns)
     ]
     names = [f"col_{i}" for i in range(num_columns)]

@@ -156,9 +154,7 @@ def test_dataset_construct_fields_fuzzy():
     arrow_weights = generate_random_arrow_array(1000, 42, generate_nulls=False)
     arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32())

-    arrow_dataset = lgb.Dataset(
-        arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups
-    )
+    arrow_dataset = lgb.Dataset(arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups)
     arrow_dataset.construct()

     pandas_dataset = lgb.Dataset(

@@ -171,9 +167,7 @@ def test_dataset_construct_fields_fuzzy():
     # Check for equality
     for field in ("label", "weight", "group"):
-        np_assert_array_equal(
-            arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True
-        )
+        np_assert_array_equal(arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True)
     np_assert_array_equal(arrow_dataset.get_label(), pandas_dataset.get_label(), strict=True)
     np_assert_array_equal(arrow_dataset.get_weight(), pandas_dataset.get_weight(), strict=True)

@@ -269,9 +263,7 @@ def test_dataset_construct_groups(array_type, group_data, arrow_type):
     ],
 )
 @pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES)
-def test_dataset_construct_init_scores_array(
-    array_type: Any, init_score_data: Any, arrow_type: Any
-):
+def test_dataset_construct_init_scores_array(array_type: Any, init_score_data: Any, arrow_type: Any):
     data = generate_dummy_arrow_table()
     init_scores = array_type(init_score_data, type=arrow_type)
     dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params())

@@ -320,9 +312,7 @@ def assert_equal_predict_arrow_pandas(booster: lgb.Booster, data: pa.Table):
     np_assert_array_equal(p_pred_contrib_arrow, p_pred_contrib_pandas, strict=True)

     p_first_iter_arrow = booster.predict(data, start_iteration=0, num_iteration=1, raw_score=True)
-    p_first_iter_pandas = booster.predict(
-        data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True
-    )
+    p_first_iter_pandas = booster.predict(data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True)
     np_assert_array_equal(p_first_iter_arrow, p_first_iter_pandas, strict=True)