tianlh / LightGBM-DCU / Commits / 1b792e71
Commit 1b792e71 (Unverified)
Authored Feb 21, 2024 by James Lamb; committed by GitHub on Feb 21, 2024

[ci] [python-package] enable ruff-format on tests and examples (#6317)
Parent: b60068c8
Changes: 30
Showing 10 changed files with 2746 additions and 3380 deletions
tests/python_package_test/test_basic.py        +178   -225
tests/python_package_test/test_callback.py       +5     -8
tests/python_package_test/test_consistency.py   +43    -44
tests/python_package_test/test_dask.py         +357   -666
tests/python_package_test/test_dual.py           +1     -1
tests/python_package_test/test_engine.py      +1350  -1620
tests/python_package_test/test_plotting.py     +209   -184
tests/python_package_test/test_sklearn.py      +561   -573
tests/python_package_test/test_utilities.py     +19    -29
tests/python_package_test/utils.py              +23    -30
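The diffs below are almost entirely mechanical formatter output: single-quoted strings become double-quoted, argument lists are re-wrapped against the configured line length, and bracketed literals that end in a trailing comma are expanded to one element per line. As a rough illustration only (this snippet is invented for this note and assumes ruff-format's default double-quote style and magic-trailing-comma handling; it is not taken from the repository), the transformation looks like this:

# Hypothetical input, written in the style seen on the "-" side of the hunks below.
params = {'max_bin': 255,
          'verbose': -1}
cases = [
    {'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]},
    {'cegb_penalty_split': 1},
]

# Roughly what ruff-format produces: double quotes everywhere, short literals
# collapsed onto one line, and literals that already end in a trailing comma
# ("magic trailing comma") kept expanded, one element per line.
params = {"max_bin": 255, "verbose": -1}
cases = [
    {"cegb_penalty_feature_coupled": [50, 100, 10, 25, 30]},
    {"cegb_penalty_split": 1},
]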
tests/python_package_test/test_basic.py  (view file @ 1b792e71)

@@ -19,8 +19,9 @@ from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal
 def test_basic(tmp_path):
-    X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True),
-                                                         test_size=0.1, random_state=2)
+    X_train, X_test, y_train, y_test = train_test_split(
+        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
+    )
     feature_names = [f"Column_{i}" for i in range(X_train.shape[1])]
     feature_names[1] = "a" * 1000  # set one name to a value longer than default buffer size
     train_data = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)

@@ -34,7 +35,7 @@ def test_basic(tmp_path):
         "verbose": -1,
         "num_threads": 1,
         "max_bin": 255,
-        "gpu_use_dp": True
+        "gpu_use_dp": True,
     }
     bst = lgb.Booster(params, train_data)
     bst.add_valid(valid_data, "valid_1")

@@ -49,7 +50,7 @@ def test_basic(tmp_path):
     assert bst.current_iteration() == 20
     assert bst.num_trees() == 20
     assert bst.num_model_per_iteration() == 1
-    if getenv('TASK', '') != 'cuda':
+    if getenv("TASK", "") != "cuda":
         assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
         assert bst.upper_bound() == pytest.approx(3.3182142872462883)

@@ -79,20 +80,19 @@ def test_basic(tmp_path):
     # test that shape is checked during prediction
     bad_X_test = X_test[:, 1:]
     bad_shape_error_msg = "The number of features in data*"
-    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
-                                   bst.predict, bad_X_test)
-    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
-                                   bst.predict, sparse.csr_matrix(bad_X_test))
-    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
-                                   bst.predict, sparse.csc_matrix(bad_X_test))
+    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, bad_X_test)
+    np.testing.assert_raises_regex(
+        lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csr_matrix(bad_X_test)
+    )
+    np.testing.assert_raises_regex(
+        lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csc_matrix(bad_X_test)
+    )
     with open(tname, "w+b") as f:
         dump_svmlight_file(bad_X_test, y_test, f)
-    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
-                                   bst.predict, tname)
+    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
     with open(tname, "w+b") as f:
         dump_svmlight_file(X_test, y_test, f, zero_based=False)
-    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
-                                   bst.predict, tname)
+    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)


 class NumpySequence(lgb.Sequence):

@@ -108,7 +108,7 @@ class NumpySequence(lgb.Sequence):
         elif isinstance(idx, slice):
             if not (idx.step is None or idx.step == 1):
                 raise NotImplementedError("No need to implement, caller will not set step by now")
-            return self.ndarray[idx.start:idx.stop]
+            return self.ndarray[idx.start : idx.stop]
         elif isinstance(idx, list):
             return self.ndarray[idx]
         else:

@@ -132,12 +132,12 @@ def _create_sequence_from_ndarray(data, num_seq, batch_size):
     return seqs


-@pytest.mark.parametrize('sample_count', [11, 100, None])
-@pytest.mark.parametrize('batch_size', [3, None])
-@pytest.mark.parametrize('include_0_and_nan', [False, True])
-@pytest.mark.parametrize('num_seq', [1, 3])
+@pytest.mark.parametrize("sample_count", [11, 100, None])
+@pytest.mark.parametrize("batch_size", [3, None])
+@pytest.mark.parametrize("include_0_and_nan", [False, True])
+@pytest.mark.parametrize("num_seq", [1, 3])
 def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
-    params = {'bin_construct_sample_cnt': sample_count}
+    params = {"bin_construct_sample_cnt": sample_count}

     nrow = 50
     half_nrow = nrow // 2

@@ -159,8 +159,8 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
     X = data[:, :-1]
     Y = data[:, -1]

-    npy_bin_fname = tmpdir / 'data_from_npy.bin'
-    seq_bin_fname = tmpdir / 'data_from_seq.bin'
+    npy_bin_fname = tmpdir / "data_from_npy.bin"
+    seq_bin_fname = tmpdir / "data_from_seq.bin"

     # Create dataset from numpy array directly.
     ds = lgb.Dataset(X, label=Y, params=params)

@@ -181,9 +181,9 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
     valid_X = valid_data[:, :-1]
     valid_Y = valid_data[:, -1]

-    valid_npy_bin_fname = tmpdir / 'valid_data_from_npy.bin'
-    valid_seq_bin_fname = tmpdir / 'valid_data_from_seq.bin'
-    valid_seq2_bin_fname = tmpdir / 'valid_data_from_seq2.bin'
+    valid_npy_bin_fname = tmpdir / "valid_data_from_npy.bin"
+    valid_seq_bin_fname = tmpdir / "valid_data_from_seq.bin"
+    valid_seq2_bin_fname = tmpdir / "valid_data_from_seq2.bin"
     valid_ds = lgb.Dataset(valid_X, label=valid_Y, params=params, reference=ds)
     valid_ds.save_binary(valid_npy_bin_fname)

@@ -200,7 +200,7 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
     assert filecmp.cmp(valid_npy_bin_fname, valid_seq2_bin_fname)


-@pytest.mark.parametrize('num_seq', [1, 2])
+@pytest.mark.parametrize("num_seq", [1, 2])
 def test_sequence_get_data(num_seq):
     nrow = 20
     ncol = 11

@@ -218,12 +218,13 @@ def test_sequence_get_data(num_seq):
 def test_chunked_dataset():
-    X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
-                                                         random_state=2)
+    X_train, X_test, y_train, y_test = train_test_split(
+        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
+    )

     chunk_size = X_train.shape[0] // 10 + 1
-    X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
-    X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
+    X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
+    X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]

     train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
     valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})

@@ -232,12 +233,13 @@ def test_chunked_dataset():
 def test_chunked_dataset_linear():
-    X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
-                                                         random_state=2)
+    X_train, X_test, y_train, y_test = train_test_split(
+        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
+    )
     chunk_size = X_train.shape[0] // 10 + 1
-    X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
-    X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
-    params = {"bin_construct_sample_cnt": 100, 'linear_tree': True}
+    X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
+    X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
+    params = {"bin_construct_sample_cnt": 100, "linear_tree": True}
     train_data = lgb.Dataset(X_train, label=y_train, params=params)
     valid_data = train_data.create_valid(X_test, label=y_test, params=params)
     train_data.construct()

@@ -246,16 +248,16 @@ def test_chunked_dataset_linear():
 def test_save_dataset_subset_and_load_from_file(tmp_path):
     data = np.random.rand(100, 2)
-    params = {'max_bin': 50, 'min_data_in_bin': 10}
+    params = {"max_bin": 50, "min_data_in_bin": 10}
     ds = lgb.Dataset(data, params=params)
-    ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / 'subset.bin')
-    lgb.Dataset(tmp_path / 'subset.bin', params=params).construct()
+    ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / "subset.bin")
+    lgb.Dataset(tmp_path / "subset.bin", params=params).construct()


 def test_subset_group():
-    rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
-    X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
-    q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
+    rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
+    X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
+    q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
     assert len(lgb_train.get_group()) == 201
     subset = lgb_train.subset(list(range(10))).construct()

@@ -294,7 +296,7 @@ def test_add_features_throws_if_datasets_unconstructed():
 def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
     X = np.random.random((100, 5))
     X[:, [1, 3]] = 0
-    names = [f'col_{i}' for i in range(5)]
+    names = [f"col_{i}" for i in range(5)]
     for j in range(1, 5):
         d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
         d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()

@@ -304,9 +306,9 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
         d = lgb.Dataset(X, feature_name=names).construct()
         dname = tmp_path / "d.txt"
         d._dump_text(dname)
-        with open(d1name, 'rt') as d1f:
+        with open(d1name, "rt") as d1f:
             d1txt = d1f.read()
-        with open(dname, 'rt') as df:
+        with open(dname, "rt") as df:
             dtxt = df.read()
         assert dtxt == d1txt

@@ -314,7 +316,7 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
 def test_add_features_same_booster_behaviour(tmp_path):
     X = np.random.random((100, 5))
     X[:, [1, 3]] = 0
-    names = [f'col_{i}' for i in range(5)]
+    names = [f"col_{i}" for i in range(5)]
     for j in range(1, 5):
         d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
         d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()

@@ -332,9 +334,9 @@ def test_add_features_same_booster_behaviour(tmp_path):
         d1name = tmp_path / "d1.txt"
         b1.save_model(d1name)
         b.save_model(dname)
-        with open(dname, 'rt') as df:
+        with open(dname, "rt") as df:
             dtxt = df.read()
-        with open(d1name, 'rt') as d1f:
+        with open(d1name, "rt") as d1f:
             d1txt = d1f.read()
         assert dtxt == d1txt

@@ -345,11 +347,12 @@ def test_add_features_from_different_sources():
     n_col = 5
     X = np.random.random((n_row, n_col))
     xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
-    names = [f'col_{i}' for i in range(n_col)]
+    names = [f"col_{i}" for i in range(n_col)]
     seq = _create_sequence_from_ndarray(X, 1, 30)
     seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct()
-    npy_list_ds = lgb.Dataset([X[:n_row // 2, :], X[n_row // 2:, :]],
-                              feature_name=names, free_raw_data=False).construct()
+    npy_list_ds = lgb.Dataset(
+        [X[: n_row // 2, :], X[n_row // 2 :, :]], feature_name=names, free_raw_data=False
+    ).construct()
     immergeable_dds = [seq_ds, npy_list_ds]
     for x_1 in xxs:
         # test that method works even with free_raw_data=True

@@ -373,20 +376,19 @@ def test_add_features_from_different_sources():
             d1.add_features_from(d2)
             assert isinstance(d1.get_data(), original_type)
             assert d1.get_data().shape == (n_row, n_col * idx)
-            res_feature_names += [f'D{idx}_{name}' for name in names]
+            res_feature_names += [f"D{idx}_{name}" for name in names]
             assert d1.feature_name == res_feature_names


 def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys):
     arr_a = np.zeros((100, 1), dtype=np.float32)
     arr_b = np.random.normal(size=(100, 5))

     dataset_a = lgb.Dataset(arr_a).construct()
     expected_msg = (
-        '[LightGBM] [Warning] There are no meaningful features which satisfy '
-        'the provided configuration. Decreasing Dataset parameters min_data_in_bin '
-        'or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n'
+        "[LightGBM] [Warning] There are no meaningful features which satisfy "
+        "the provided configuration. Decreasing Dataset parameters min_data_in_bin "
+        "or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n"
     )
     log_lines = capsys.readouterr().out
     assert expected_msg in log_lines

@@ -404,7 +406,7 @@ def test_cegb_affects_behavior(tmp_path):
     X = np.random.random((100, 5))
     X[:, [1, 3]] = 0
     y = np.random.random(100)
-    names = [f'col_{i}' for i in range(5)]
+    names = [f"col_{i}" for i in range(5)]
     ds = lgb.Dataset(X, feature_name=names).construct()
     ds.set_label(y)
     base = lgb.Booster(train_set=ds)

@@ -412,19 +414,21 @@ def test_cegb_affects_behavior(tmp_path):
         base.update()
     basename = tmp_path / "basename.txt"
     base.save_model(basename)
-    with open(basename, 'rt') as f:
+    with open(basename, "rt") as f:
         basetxt = f.read()
     # Set extremely harsh penalties, so CEGB will block most splits.
-    cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]},
-             {'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]},
-             {'cegb_penalty_split': 1}]
+    cases = [
+        {"cegb_penalty_feature_coupled": [50, 100, 10, 25, 30]},
+        {"cegb_penalty_feature_lazy": [1, 2, 3, 4, 5]},
+        {"cegb_penalty_split": 1},
+    ]
     for case in cases:
         booster = lgb.Booster(train_set=ds, params=case)
         for _ in range(10):
             booster.update()
         casename = tmp_path / "casename.txt"
         booster.save_model(casename)
-        with open(casename, 'rt') as f:
+        with open(casename, "rt") as f:
             casetxt = f.read()
         assert basetxt != casetxt

@@ -433,17 +437,22 @@ def test_cegb_scaling_equalities(tmp_path):
     X = np.random.random((100, 5))
     X[:, [1, 3]] = 0
     y = np.random.random(100)
-    names = [f'col_{i}' for i in range(5)]
+    names = [f"col_{i}" for i in range(5)]
     ds = lgb.Dataset(X, feature_name=names).construct()
     ds.set_label(y)
     # Compare pairs of penalties, to ensure scaling works as intended
-    pairs = [({'cegb_penalty_feature_coupled': [1, 2, 1, 2, 1]},
-              {'cegb_penalty_feature_coupled': [0.5, 1, 0.5, 1, 0.5], 'cegb_tradeoff': 2}),
-             ({'cegb_penalty_feature_lazy': [0.01, 0.02, 0.03, 0.04, 0.05]},
-              {'cegb_penalty_feature_lazy': [0.005, 0.01, 0.015, 0.02, 0.025], 'cegb_tradeoff': 2}),
-             ({'cegb_penalty_split': 1},
-              {'cegb_penalty_split': 2, 'cegb_tradeoff': 0.5})]
-    for (p1, p2) in pairs:
+    pairs = [
+        (
+            {"cegb_penalty_feature_coupled": [1, 2, 1, 2, 1]},
+            {"cegb_penalty_feature_coupled": [0.5, 1, 0.5, 1, 0.5], "cegb_tradeoff": 2},
+        ),
+        (
+            {"cegb_penalty_feature_lazy": [0.01, 0.02, 0.03, 0.04, 0.05]},
+            {"cegb_penalty_feature_lazy": [0.005, 0.01, 0.015, 0.02, 0.025], "cegb_tradeoff": 2},
+        ),
+        ({"cegb_penalty_split": 1}, {"cegb_penalty_split": 2, "cegb_tradeoff": 0.5}),
+    ]
+    for p1, p2 in pairs:
         booster1 = lgb.Booster(train_set=ds, params=p1)
         booster2 = lgb.Booster(train_set=ds, params=p2)
         for _ in range(10):

@@ -453,32 +462,30 @@ def test_cegb_scaling_equalities(tmp_path):
         # Reset booster1's parameters to p2, so the parameter section of the file matches.
         booster1.reset_parameter(p2)
         booster1.save_model(p1name)
-        with open(p1name, 'rt') as f:
+        with open(p1name, "rt") as f:
             p1txt = f.read()
         p2name = tmp_path / "p2.txt"
         booster2.save_model(p2name)
-        with open(p2name, 'rt') as f:
+        with open(p2name, "rt") as f:
             p2txt = f.read()
         assert p1txt == p2txt


 def test_consistent_state_for_dataset_fields():
     def check_asserts(data):
         np.testing.assert_allclose(data.label, data.get_label())
-        np.testing.assert_allclose(data.label, data.get_field('label'))
+        np.testing.assert_allclose(data.label, data.get_field("label"))
         assert not np.isnan(data.label[0])
         assert not np.isinf(data.label[1])
         np.testing.assert_allclose(data.weight, data.get_weight())
-        np.testing.assert_allclose(data.weight, data.get_field('weight'))
+        np.testing.assert_allclose(data.weight, data.get_field("weight"))
         assert not np.isnan(data.weight[0])
         assert not np.isinf(data.weight[1])
         np.testing.assert_allclose(data.init_score, data.get_init_score())
-        np.testing.assert_allclose(data.init_score, data.get_field('init_score'))
+        np.testing.assert_allclose(data.init_score, data.get_field("init_score"))
         assert not np.isnan(data.init_score[0])
         assert not np.isinf(data.init_score[1])
-        assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]],
-                                 data.label[0]))
+        assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]], data.label[0]))
         assert data.label[1] == pytest.approx(data.weight[1])
         assert data.feature_name == data.get_feature_name()

@@ -486,10 +493,8 @@ def test_consistent_state_for_dataset_fields():
     sequence = np.ones(y.shape[0])
     sequence[0] = np.nan
     sequence[1] = np.inf
-    feature_names = [f'f{i}' for i in range(X.shape[1])]
-    lgb_data = lgb.Dataset(X, sequence,
-                           weight=sequence, init_score=sequence, feature_name=feature_names).construct()
+    feature_names = [f"f{i}" for i in range(X.shape[1])]
+    lgb_data = lgb.Dataset(X, sequence, weight=sequence, init_score=sequence, feature_name=feature_names).construct()
     check_asserts(lgb_data)
     lgb_data = lgb.Dataset(X, y).construct()
     lgb_data.set_label(sequence)

@@ -500,20 +505,15 @@ def test_consistent_state_for_dataset_fields():
 def test_dataset_construction_overwrites_user_provided_metadata_fields():
     X = np.array([[1.0, 2.0], [3.0, 4.0]])

     position = np.array([0.0, 1.0], dtype=np.float32)
-    if getenv('TASK', '') == 'cuda':
+    if getenv("TASK", "") == "cuda":
         position = None

     dtrain = lgb.Dataset(
         X,
         params={
             "min_data_in_bin": 1,
             "min_data_in_leaf": 1,
-            "verbosity": -1
+            "verbosity": -1,
         },
         group=[1, 1],
         init_score=[0.312, 0.708],
         label=[1, 2],

@@ -528,17 +528,9 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
     assert dtrain.get_init_score() == [0.312, 0.708]
     assert dtrain.label == [1, 2]
     assert dtrain.get_label() == [1, 2]
-    if getenv('TASK', '') != 'cuda':
-        np_assert_array_equal(
-            dtrain.position,
-            np.array([0.0, 1.0], dtype=np.float32),
-            strict=True
-        )
-        np_assert_array_equal(
-            dtrain.get_position(),
-            np.array([0.0, 1.0], dtype=np.float32),
-            strict=True
-        )
+    if getenv("TASK", "") != "cuda":
+        np_assert_array_equal(dtrain.position, np.array([0.0, 1.0], dtype=np.float32), strict=True)
+        np_assert_array_equal(dtrain.get_position(), np.array([0.0, 1.0], dtype=np.float32), strict=True)
     assert dtrain.weight == [0.5, 1.5]
     assert dtrain.get_weight() == [0.5, 1.5]

@@ -554,13 +546,11 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
     np_assert_array_equal(dtrain.group, expected_group, strict=True)
     np_assert_array_equal(dtrain.get_group(), expected_group, strict=True)
     # get_field("group") returns a numpy array with boundaries, instead of size
-    np_assert_array_equal(
-        dtrain.get_field("group"),
-        np.array([0, 1, 2], dtype=np.int32),
-        strict=True
-    )
+    np_assert_array_equal(dtrain.get_field("group"), np.array([0, 1, 2], dtype=np.int32), strict=True)

-    expected_init_score = np.array([0.312, 0.708],)
+    expected_init_score = np.array(
+        [0.312, 0.708],
+    )
     np_assert_array_equal(dtrain.init_score, expected_init_score, strict=True)
     np_assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True)
     np_assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True)

@@ -570,16 +560,12 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
     np_assert_array_equal(dtrain.get_label(), expected_label, strict=True)
     np_assert_array_equal(dtrain.get_field("label"), expected_label, strict=True)

-    if getenv('TASK', '') != 'cuda':
+    if getenv("TASK", "") != "cuda":
         expected_position = np.array([0.0, 1.0], dtype=np.float32)
         np_assert_array_equal(dtrain.position, expected_position, strict=True)
         np_assert_array_equal(dtrain.get_position(), expected_position, strict=True)
         # NOTE: "position" is converted to int32 on the C++ side
-        np_assert_array_equal(
-            dtrain.get_field("position"),
-            np.array([0.0, 1.0], dtype=np.int32),
-            strict=True
-        )
+        np_assert_array_equal(dtrain.get_field("position"), np.array([0.0, 1.0], dtype=np.int32), strict=True)

     expected_weight = np.array([0.5, 1.5], dtype=np.float32)
     np_assert_array_equal(dtrain.weight, expected_weight, strict=True)

@@ -588,7 +574,6 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
 def test_choose_param_value():
-
     original_params = {
         "local_listen_port": 1234,
         "port": 2222,

@@ -599,30 +584,20 @@ def test_choose_param_value():
     # should resolve duplicate aliases, and prefer the main parameter
-    params = lgb.basic._choose_param_value(
-        main_param_name="local_listen_port",
-        params=original_params,
-        default_value=5555
-    )
+    params = lgb.basic._choose_param_value(
+        main_param_name="local_listen_port", params=original_params, default_value=5555
+    )
     assert params["local_listen_port"] == 1234
     assert "port" not in params

     # should choose the highest priority alias and set that value on main param
     # if only aliases are used
-    params = lgb.basic._choose_param_value(
-        main_param_name="num_iterations",
-        params=params,
-        default_value=17
-    )
+    params = lgb.basic._choose_param_value(main_param_name="num_iterations", params=params, default_value=17)
     assert params["num_iterations"] == 13
     assert "num_trees" not in params
     assert "n_iter" not in params

     # should use the default if main param and aliases are missing
-    params = lgb.basic._choose_param_value(
-        main_param_name="learning_rate",
-        params=params,
-        default_value=0.789
-    )
+    params = lgb.basic._choose_param_value(main_param_name="learning_rate", params=params, default_value=0.789)
     assert params["learning_rate"] == 0.789

     # all changes should be made on copies and not modify the original

@@ -637,37 +612,23 @@ def test_choose_param_value():
 def test_choose_param_value_preserves_nones():
     # preserves None found for main param and still removes aliases
     params = lgb.basic._choose_param_value(
         main_param_name="num_threads",
-        params={
-            "num_threads": None,
-            "n_jobs": 4,
-            "objective": "regression"
-        },
-        default_value=2
+        params={"num_threads": None, "n_jobs": 4, "objective": "regression"},
+        default_value=2,
     )
     assert params == {"num_threads": None, "objective": "regression"}

     # correctly chooses value when only an alias is provided
     params = lgb.basic._choose_param_value(
-        main_param_name="num_threads",
-        params={"n_jobs": None, "objective": "regression"},
-        default_value=2
+        main_param_name="num_threads", params={"n_jobs": None, "objective": "regression"}, default_value=2
     )
     assert params == {"num_threads": None, "objective": "regression"}

     # adds None if that's given as the default and param not found
     params = lgb.basic._choose_param_value(
-        main_param_name="min_data_in_leaf",
-        params={"objective": "regression"},
-        default_value=None
+        main_param_name="min_data_in_leaf", params={"objective": "regression"}, default_value=None
     )
     assert params == {"objective": "regression", "min_data_in_leaf": None}

@@ -676,51 +637,39 @@ def test_choose_param_value_preserves_nones():
 def test_choose_param_value_objective(objective_alias):
     # If callable is found in objective
     params = {objective_alias: dummy_obj}
-    params = lgb.basic._choose_param_value(
-        main_param_name="objective",
-        params=params,
-        default_value=None
-    )
-    assert params['objective'] == dummy_obj
+    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=None)
+    assert params["objective"] == dummy_obj

     # Value in params should be preferred to the default_value passed from keyword arguments
     params = {objective_alias: dummy_obj}
-    params = lgb.basic._choose_param_value(
-        main_param_name="objective",
-        params=params,
-        default_value=mse_obj
-    )
-    assert params['objective'] == dummy_obj
+    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
+    assert params["objective"] == dummy_obj

     # None of objective or its aliases in params, but default_value is callable.
     params = {}
-    params = lgb.basic._choose_param_value(
-        main_param_name="objective",
-        params=params,
-        default_value=mse_obj
-    )
-    assert params['objective'] == mse_obj
+    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
+    assert params["objective"] == mse_obj


-@pytest.mark.parametrize('collection', ['1d_np', '2d_np', 'pd_float', 'pd_str', '1d_list', '2d_list'])
-@pytest.mark.parametrize('dtype', [np.float32, np.float64])
+@pytest.mark.parametrize("collection", ["1d_np", "2d_np", "pd_float", "pd_str", "1d_list", "2d_list"])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_list_to_1d_numpy(collection, dtype):
     collection2y = {
-        '1d_np': np.random.rand(10),
-        '2d_np': np.random.rand(10, 1),
-        'pd_float': np.random.rand(10),
-        'pd_str': ['a', 'b'],
-        '1d_list': [1] * 10,
-        '2d_list': [[1], [2]],
+        "1d_np": np.random.rand(10),
+        "2d_np": np.random.rand(10, 1),
+        "pd_float": np.random.rand(10),
+        "pd_str": ["a", "b"],
+        "1d_list": [1] * 10,
+        "2d_list": [[1], [2]],
     }
     y = collection2y[collection]
-    if collection.startswith('pd'):
+    if collection.startswith("pd"):
         if not PANDAS_INSTALLED:
-            pytest.skip('pandas is not installed')
+            pytest.skip("pandas is not installed")
         else:
             y = pd_Series(y)

     if isinstance(y, np.ndarray) and len(y.shape) == 2:
-        with pytest.warns(UserWarning, match='column-vector'):
+        with pytest.warns(UserWarning, match="column-vector"):
             lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
             return
     elif isinstance(y, list) and isinstance(y[0], list):

@@ -736,30 +685,31 @@ def test_list_to_1d_numpy(collection, dtype):
     assert result.dtype == dtype


-@pytest.mark.parametrize('init_score_type', ['array', 'dataframe', 'list'])
+@pytest.mark.parametrize("init_score_type", ["array", "dataframe", "list"])
 def test_init_score_for_multiclass_classification(init_score_type):
     init_score = [[i * 10 + j for j in range(3)] for i in range(10)]
-    if init_score_type == 'array':
+    if init_score_type == "array":
         init_score = np.array(init_score)
-    elif init_score_type == 'dataframe':
+    elif init_score_type == "dataframe":
         if not PANDAS_INSTALLED:
-            pytest.skip('Pandas is not installed.')
+            pytest.skip("Pandas is not installed.")
         init_score = pd_DataFrame(init_score)
     data = np.random.rand(10, 2)
     ds = lgb.Dataset(data, init_score=init_score).construct()
-    np.testing.assert_equal(ds.get_field('init_score'), init_score)
+    np.testing.assert_equal(ds.get_field("init_score"), init_score)
     np.testing.assert_equal(ds.init_score, init_score)


 def test_smoke_custom_parser(tmp_path):
-    data_path = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification' / 'binary.train'
-    parser_config_file = tmp_path / 'parser.ini'
-    with open(parser_config_file, 'w') as fout:
+    data_path = Path(__file__).absolute().parents[2] / "examples" / "binary_classification" / "binary.train"
+    parser_config_file = tmp_path / "parser.ini"
+    with open(parser_config_file, "w") as fout:
         fout.write('{"className": "dummy", "id": "1"}')

     data = lgb.Dataset(data_path, params={"parser_config_file": parser_config_file})
-    with pytest.raises(lgb.basic.LightGBMError,
-                       match="Cannot find parser class 'dummy', please register first or check config format"):
+    with pytest.raises(
+        lgb.basic.LightGBMError, match="Cannot find parser class 'dummy', please register first or check config format"
+    ):
         data.construct()

@@ -770,9 +720,13 @@ def test_param_aliases():
     assert all(isinstance(i, list) for i in aliases.values())
     assert all(len(i) >= 1 for i in aliases.values())
     assert all(k in v for k, v in aliases.items())
-    assert lgb.basic._ConfigAliases.get('config', 'task') == {'config', 'config_file', 'task', 'task_type'}
-    assert lgb.basic._ConfigAliases.get_sorted('min_data_in_leaf') == [
-        'min_data_in_leaf', 'min_data', 'min_samples_leaf', 'min_child_samples', 'min_data_per_leaf'
+    assert lgb.basic._ConfigAliases.get("config", "task") == {"config", "config_file", "task", "task_type"}
+    assert lgb.basic._ConfigAliases.get_sorted("min_data_in_leaf") == [
+        "min_data_in_leaf",
+        "min_data",
+        "min_samples_leaf",
+        "min_child_samples",
+        "min_data_per_leaf",
     ]

@@ -793,10 +747,10 @@ def test_custom_objective_safety():
     y_multiclass = np.arange(nrows) % nclass
     ds_binary = lgb.Dataset(X, y_binary).construct()
     ds_multiclass = lgb.Dataset(X, y_multiclass).construct()
-    bad_bst_binary = lgb.Booster({'objective': "none"}, ds_binary)
-    good_bst_binary = lgb.Booster({'objective': "none"}, ds_binary)
-    bad_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass)
-    good_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass)
+    bad_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
+    good_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
+    bad_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
+    good_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
     good_bst_binary.update(fobj=_good_gradients)
     with pytest.raises(ValueError, match=re.escape("number of models per one iteration (1)")):
         bad_bst_binary.update(fobj=_bad_gradients)

@@ -805,33 +759,30 @@ def test_custom_objective_safety():
     bad_bst_multi.update(fobj=_bad_gradients)


-@pytest.mark.parametrize('dtype', [np.float32, np.float64])
-@pytest.mark.parametrize('feature_name', [['x1', 'x2'], 'auto'])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("feature_name", [["x1", "x2"], "auto"])
 def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
-    pd = pytest.importorskip('pandas')
+    pd = pytest.importorskip("pandas")
     X = np.random.rand(10, 2).astype(dtype)
     df = pd.DataFrame(X)
     built_data = lgb.basic._data_from_pandas(
-        data=df,
-        feature_name=feature_name,
-        categorical_feature="auto",
-        pandas_categorical=None
+        data=df, feature_name=feature_name, categorical_feature="auto", pandas_categorical=None
     )[0]
     assert built_data.dtype == dtype
     assert np.shares_memory(X, built_data)


-@pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto'])
-@pytest.mark.parametrize('categories', ['seen', 'unseen'])
+@pytest.mark.parametrize("feature_name", [["x1"], [42], "auto"])
+@pytest.mark.parametrize("categories", ["seen", "unseen"])
 def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories):
-    pd = pytest.importorskip('pandas')
-    X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
-    column_name = 'a' if feature_name == 'auto' else feature_name[0]
-    df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category')
-    if categories == 'seen':
-        pandas_categorical = [['a', 'b']]
+    pd = pytest.importorskip("pandas")
+    X = np.random.choice(["a", "b"], 100).reshape(-1, 1)
+    column_name = "a" if feature_name == "auto" else feature_name[0]
+    df = pd.DataFrame(X.copy(), columns=[column_name], dtype="category")
+    if categories == "seen":
+        pandas_categorical = [["a", "b"]]
     else:
-        pandas_categorical = [['a']]
+        pandas_categorical = [["a"]]
     data = lgb.basic._data_from_pandas(
         data=df,
         feature_name=feature_name,

@@ -841,31 +792,33 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories):
     # check that the original data wasn't modified
     np.testing.assert_equal(df[column_name], X[:, 0])
     # check that the built data has the codes
-    if categories == 'seen':
+    if categories == "seen":
         # if all categories were seen during training we just take the codes
         codes = df[column_name].cat.codes
     else:
         # if we only saw 'a' during training we just replace its code
         # and leave the rest as nan
-        a_code = df[column_name].cat.categories.get_loc('a')
-        codes = np.where(df[column_name] == 'a', a_code, np.nan)
+        a_code = df[column_name].cat.categories.get_loc("a")
+        codes = np.where(df[column_name] == "a", a_code, np.nan)
     np.testing.assert_equal(codes, data[:, 0])


-@pytest.mark.parametrize('min_data_in_bin', [2, 10])
+@pytest.mark.parametrize("min_data_in_bin", [2, 10])
 def test_feature_num_bin(min_data_in_bin):
-    X = np.vstack([
-        np.random.rand(100),
-        np.array([1, 2] * 50),
-        np.array([0, 1, 2] * 33 + [0]),
-        np.array([1, 2] * 49 + 2 * [np.nan]),
-        np.zeros(100),
-        np.random.choice([0, 1], 100),
-    ]).T
+    X = np.vstack(
+        [
+            np.random.rand(100),
+            np.array([1, 2] * 50),
+            np.array([0, 1, 2] * 33 + [0]),
+            np.array([1, 2] * 49 + 2 * [np.nan]),
+            np.zeros(100),
+            np.random.choice([0, 1], 100),
+        ]
+    ).T
     n_continuous = X.shape[1] - 1
-    feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
+    feature_name = [f"x{i}" for i in range(n_continuous)] + ["cat1"]
     ds_kwargs = {
-        "params": {'min_data_in_bin': min_data_in_bin},
+        "params": {"min_data_in_bin": min_data_in_bin},
         "categorical_feature": [n_continuous],  # last feature
     }
     ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()

@@ -884,7 +837,7 @@ def test_feature_num_bin(min_data_in_bin):
     assert bins_by_name == expected_num_bins

     # test using default feature names
     ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
-    default_names = [f'Column_{i}' for i in range(X.shape[1])]
+    default_names = [f"Column_{i}" for i in range(X.shape[1])]
     bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
     assert bins_by_default_name == expected_num_bins

     # check for feature indices outside of range

@@ -892,9 +845,9 @@ def test_feature_num_bin(min_data_in_bin):
     with pytest.raises(
         lgb.basic.LightGBMError,
         match=(
-            f'Tried to retrieve number of bins for feature index {num_features}, '
-            f'but the valid feature indices are \\[0, {num_features - 1}\\].'
-        )
+            f"Tried to retrieve number of bins for feature index {num_features}, "
+            f"but the valid feature indices are \\[0, {num_features - 1}\\]."
+        ),
     ):
         ds.feature_num_bin(num_features)

@@ -902,7 +855,7 @@ def test_feature_num_bin(min_data_in_bin):
 def test_feature_num_bin_with_max_bin_by_feature():
     X = np.random.rand(100, 3)
     max_bin_by_feature = np.random.randint(3, 30, size=X.shape[1])
-    ds = lgb.Dataset(X, params={'max_bin_by_feature': max_bin_by_feature}).construct()
+    ds = lgb.Dataset(X, params={"max_bin_by_feature": max_bin_by_feature}).construct()
     actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
     np.testing.assert_equal(actual_num_bins, max_bin_by_feature)

@@ -910,7 +863,7 @@ def test_feature_num_bin_with_max_bin_by_feature():
 def test_set_leaf_output():
     X, y = load_breast_cancer(return_X_y=True)
     ds = lgb.Dataset(X, y)
-    bst = lgb.Booster({'num_leaves': 2}, ds)
+    bst = lgb.Booster({"num_leaves": 2}, ds)
     bst.update()
     y_pred = bst.predict(X)
     for leaf_id in range(2):
tests/python_package_test/test_callback.py
View file @
1b792e71
...
@@ -10,7 +10,7 @@ def reset_feature_fraction(boosting_round):
return 0.6 if boosting_round < 15 else 0.8
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_early_stopping_callback_is_picklable(serializer):
rounds = 5
callback = lgb.early_stopping(stopping_rounds=rounds)
...
@@ -32,7 +32,7 @@ def test_early_stopping_callback_rejects_invalid_stopping_rounds_with_informativ
lgb.early_stopping(stopping_rounds="neverrrr")
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_log_evaluation_callback_is_picklable(serializer):
periods = 42
callback = lgb.log_evaluation(period=periods)
...
@@ -43,7 +43,7 @@ def test_log_evaluation_callback_is_picklable(serializer):
assert callback.period == periods
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_record_evaluation_callback_is_picklable(serializer):
results = {}
callback = lgb.record_evaluation(eval_result=results)
...
@@ -54,12 +54,9 @@ def test_record_evaluation_callback_is_picklable(serializer):
assert callback.eval_result is results
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_reset_parameter_callback_is_picklable(serializer):
params = {
'bagging_fraction': [0.7] * 5 + [0.6] * 5, 'feature_fraction': reset_feature_fraction}
"bagging_fraction": [0.7] * 5 + [0.6] * 5, "feature_fraction": reset_feature_fraction}
callback = lgb.reset_parameter(**params)
callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer)
assert callback_from_disk.order == 10
...
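Aside (not part of this diff): the callback tests above serialize a callback and compare attributes after restoring it; the same round-trip can be sketched directly with pickle:

    # Round-trip a LightGBM callback through pickle, mirroring the checks above.
    import pickle
    import lightgbm as lgb

    callback = lgb.log_evaluation(period=42)
    restored = pickle.loads(pickle.dumps(callback))
    assert restored.period == 42  # same attribute the test compares after unpickling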
tests/python_package_test/test_consistency.py View file @ 1b792e71
...
@@ -6,22 +6,21 @@ from sklearn.datasets import load_svmlight_file
import lightgbm as lgb
EXAMPLES_DIR = Path(__file__).absolute().parents[2] / 'examples'
EXAMPLES_DIR = Path(__file__).absolute().parents[2] / "examples"
class FileLoader:
def __init__(self, directory, prefix, config_file="train.conf"):
def __init__(self, directory, prefix, config_file='train.conf'):
self.directory = directory
self.prefix = prefix
self.params = {'gpu_use_dp': True}
self.params = {"gpu_use_dp": True}
with open(self.directory / config_file, 'r') as f:
with open(self.directory / config_file, "r") as f:
for line in f.readlines():
line = line.strip()
if line and not line.startswith('#'):
if line and not line.startswith("#"):
key, value = [token.strip() for token in line.split('=')]
key, value = [token.strip() for token in line.split("=")]
if 'early_stopping' not in key:  # disable early_stopping
if "early_stopping" not in key:  # disable early_stopping
self.params[key] = value if key not in {'num_trees', 'num_threads'} else int(value)
self.params[key] = value if key not in {"num_trees", "num_threads"} else int(value)
def load_dataset(self, suffix, is_sparse=False):
filename = str(self.path(suffix))
...
@@ -33,14 +32,14 @@ class FileLoader:
return mat[:, 1:], mat[:, 0], filename
def load_field(self, suffix):
return np.loadtxt(str(self.directory / f'{self.prefix}{suffix}'))
return np.loadtxt(str(self.directory / f"{self.prefix}{suffix}"))
def load_cpp_result(self, result_file='LightGBM_predict_result.txt'):
def load_cpp_result(self, result_file="LightGBM_predict_result.txt"):
return np.loadtxt(str(self.directory / result_file))
def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred):
params = dict(self.params)
params['force_row_wise'] = True
params["force_row_wise"] = True
gbm = lgb.train(params, lgb_train)
y_pred = gbm.predict(X_test)
cpp_pred = gbm.predict(X_test_fn)
...
@@ -49,7 +48,7 @@ class FileLoader:
def file_load_check(self, lgb_train, name):
lgb_train_f = lgb.Dataset(self.path(name), params=self.params).construct()
for f in ('num_data', 'num_feature', 'get_label', 'get_weight', 'get_init_score', 'get_group'):
for f in ("num_data", "num_feature", "get_label", "get_weight", "get_init_score", "get_group"):
a = getattr(lgb_train, f)()
b = getattr(lgb_train_f, f)()
if a is None and b is None:
...
@@ -62,83 +61,83 @@ class FileLoader:
assert a == b, f
def path(self, suffix):
return self.directory / f'{self.prefix}{suffix}'
return self.directory / f"{self.prefix}{suffix}"
def test_binary():
fd = FileLoader(EXAMPLES_DIR / 'binary_classification', 'binary')
fd = FileLoader(EXAMPLES_DIR / "binary_classification", "binary")
X_train, y_train, _ = fd.load_dataset('.train')
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset('.test')
X_test, _, X_test_fn = fd.load_dataset(".test")
weight_train = fd.load_field('.train.weight')
weight_train = fd.load_field(".train.weight")
lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train, sample_weight=weight_train)
sk_pred = gbm.predict_proba(X_test)[:, 1]
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_binary_linear():
fd = FileLoader(EXAMPLES_DIR / 'binary_classification', 'binary', 'train_linear.conf')
fd = FileLoader(EXAMPLES_DIR / "binary_classification", "binary", "train_linear.conf")
X_train, y_train, _ = fd.load_dataset('.train')
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset('.test')
X_test, _, X_test_fn = fd.load_dataset(".test")
weight_train = fd.load_field('.train.weight')
weight_train = fd.load_field(".train.weight")
lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train, sample_weight=weight_train)
sk_pred = gbm.predict_proba(X_test)[:, 1]
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_multiclass():
fd = FileLoader(EXAMPLES_DIR / 'multiclass_classification', 'multiclass')
fd = FileLoader(EXAMPLES_DIR / "multiclass_classification", "multiclass")
X_train, y_train, _ = fd.load_dataset('.train')
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset('.test')
X_test, _, X_test_fn = fd.load_dataset(".test")
lgb_train = lgb.Dataset(X_train, y_train)
gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train)
sk_pred = gbm.predict_proba(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_regression():
fd = FileLoader(EXAMPLES_DIR / 'regression', 'regression')
fd = FileLoader(EXAMPLES_DIR / "regression", "regression")
X_train, y_train, _ = fd.load_dataset('.train')
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset('.test')
X_test, _, X_test_fn = fd.load_dataset(".test")
init_score_train = fd.load_field('.train.init')
init_score_train = fd.load_field(".train.init")
lgb_train = lgb.Dataset(X_train, y_train, init_score=init_score_train)
gbm = lgb.LGBMRegressor(**fd.params)
gbm.fit(X_train, y_train, init_score=init_score_train)
sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_lambdarank():
fd = FileLoader(EXAMPLES_DIR / 'lambdarank', 'rank')
fd = FileLoader(EXAMPLES_DIR / "lambdarank", "rank")
X_train, y_train, _ = fd.load_dataset('.train', is_sparse=True)
X_train, y_train, _ = fd.load_dataset(".train", is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset(".test", is_sparse=True)
group_train = fd.load_field('.train.query')
group_train = fd.load_field(".train.query")
lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
params = dict(fd.params)
params['force_col_wise'] = True
params["force_col_wise"] = True
gbm = lgb.LGBMRanker(**params)
gbm.fit(X_train, y_train, group=group_train)
sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_xendcg():
fd = FileLoader(EXAMPLES_DIR / 'xendcg', 'rank')
fd = FileLoader(EXAMPLES_DIR / "xendcg", "rank")
X_train, y_train, _ = fd.load_dataset('.train', is_sparse=True)
X_train, y_train, _ = fd.load_dataset(".train", is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset(".test", is_sparse=True)
group_train = fd.load_field('.train.query')
group_train = fd.load_field(".train.query")
lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
gbm = lgb.LGBMRanker(**fd.params)
gbm.fit(X_train, y_train, group=group_train)
sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
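Aside (not part of this diff): FileLoader above builds its params from "key = value" lines in a config file; a self-contained sketch of that parsing, using a made-up config string instead of the files under examples/, is:

    # Parse "key = value" lines into a LightGBM params dict, as FileLoader.__init__ does.
    config_text = """
    objective = binary
    num_trees = 50
    # comments and blank lines are ignored
    early_stopping_round = 10
    """
    params = {}
    for line in config_text.splitlines():
        line = line.strip()
        if line and not line.startswith("#"):
            key, value = [token.strip() for token in line.split("=")]
            if "early_stopping" not in key:  # the tests deliberately drop early stopping
                params[key] = value if key not in {"num_trees", "num_threads"} else int(value)
    print(params)  # {'objective': 'binary', 'num_trees': 50}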
tests/python_package_test/test_dask.py View file @ 1b792e71
...
@@ -17,12 +17,12 @@ import lightgbm as lgb
from .utils import sklearn_multiclass_custom_objective
if not platform.startswith('linux'):
if not platform.startswith("linux"):
pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True)
pytest.skip("lightgbm.dask is currently supported in Linux environments", allow_module_level=True)
if machine() != 'x86_64':
if machine() != "x86_64":
pytest.skip('lightgbm.dask tests are currently skipped on some architectures like arm64', allow_module_level=True)
pytest.skip("lightgbm.dask tests are currently skipped on some architectures like arm64", allow_module_level=True)
if not lgb.compat.DASK_INSTALLED:
pytest.skip('Dask is not installed', allow_module_level=True)
pytest.skip("Dask is not installed", allow_module_level=True)
import dask.array as da
import dask.dataframe as dd
...
@@ -37,46 +37,46 @@ from sklearn.datasets import make_blobs, make_regression
from .utils import make_ranking, pickle_obj, unpickle_obj
tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking']
tasks = ["binary-classification", "multiclass-classification", "regression", "ranking"]
distributed_training_algorithms = ['data', 'voting']
distributed_training_algorithms = ["data", "voting"]
data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical']
data_output = ["array", "scipy_csr_matrix", "dataframe", "dataframe-with-categorical"]
boosting_types = ['gbdt', 'dart', 'goss', 'rf']
boosting_types = ["gbdt", "dart", "goss", "rf"]
group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
task_to_dask_factory = {
'regression': lgb.DaskLGBMRegressor,
"regression": lgb.DaskLGBMRegressor,
'binary-classification': lgb.DaskLGBMClassifier,
"binary-classification": lgb.DaskLGBMClassifier,
'multiclass-classification': lgb.DaskLGBMClassifier,
"multiclass-classification": lgb.DaskLGBMClassifier,
'ranking': lgb.DaskLGBMRanker
"ranking": lgb.DaskLGBMRanker,
}
task_to_local_factory = {
'regression': lgb.LGBMRegressor,
"regression": lgb.LGBMRegressor,
'binary-classification': lgb.LGBMClassifier,
"binary-classification": lgb.LGBMClassifier,
'multiclass-classification': lgb.LGBMClassifier,
"multiclass-classification": lgb.LGBMClassifier,
'ranking': lgb.LGBMRanker
"ranking": lgb.LGBMRanker,
}
pytestmark = [
pytest.mark.skipif(getenv('TASK', '') == 'mpi', reason='Fails to run with MPI interface'),
pytest.mark.skipif(getenv("TASK", "") == "mpi", reason="Fails to run with MPI interface"),
pytest.mark.skipif(getenv('TASK', '') == 'gpu', reason='Fails to run with GPU interface'),
pytest.mark.skipif(getenv("TASK", "") == "gpu", reason="Fails to run with GPU interface"),
pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Fails to run with CUDA interface')
pytest.mark.skipif(getenv("TASK", "") == "cuda", reason="Fails to run with CUDA interface"),
]
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def cluster():
dask_cluster = LocalCluster(n_workers=2, threads_per_worker=2, dashboard_address=None)
yield dask_cluster
dask_cluster.close()
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def cluster2():
dask_cluster = LocalCluster(n_workers=2, threads_per_worker=2, dashboard_address=None)
yield dask_cluster
dask_cluster.close()
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def cluster_three_workers():
dask_cluster = LocalCluster(n_workers=3, threads_per_worker=1, dashboard_address=None)
yield dask_cluster
...
@@ -93,46 +93,43 @@ listen_port.port = 13000
def _get_workers_hostname(cluster: LocalCluster) -> str:
one_worker_address = next(iter(cluster.scheduler_info['workers']))
one_worker_address = next(iter(cluster.scheduler_info["workers"]))
return urlparse(one_worker_address).hostname
def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs):
def _create_ranking_data(n_samples=100, output="array", chunk_size=50, **kwargs):
X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs)
rnd = np.random.RandomState(42)
w = rnd.rand(X.shape[0]) * 0.01
g_rle = np.array([len(list(grp)) for _, grp in groupby(g)])
if output.startswith('dataframe'):
if output.startswith("dataframe"):
# add target, weight, and group to DataFrame so that partitions abide by group boundaries.
X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
if output == 'dataframe-with-categorical':
if output == "dataframe-with-categorical":
for i in range(5):
col_name = f"cat_col{i}"
cat_values = rnd.choice(['a', 'b'], X.shape[0])
cat_values = rnd.choice(["a", "b"], X.shape[0])
cat_series = pd.Series(cat_values, dtype="category")
cat_series = pd.Series(cat_values, dtype='category')
X_df[col_name] = cat_series
X = X_df.copy()
X_df = X_df.assign(y=y, g=g, w=w)
# set_index ensures partitions are based on group id.
# See https://stackoverflow.com/questions/49532824/dask-dataframe-split-partitions-based-on-a-column-or-function.
X_df.set_index('g', inplace=True)
X_df.set_index("g", inplace=True)
dX = dd.from_pandas(X_df, chunksize=chunk_size)
# separate target, weight from features.
dy = dX['y']
dy = dX["y"]
dw = dX['w']
dw = dX["w"]
dX = dX.drop(columns=['y', 'w'])
dX = dX.drop(columns=["y", "w"])
dg = dX.index.to_series()
# encode group identifiers into run-length encoding, the format LightGBMRanker is expecting
# so that within each partition, sum(g) = n_samples.
dg = dg.map_partitions(lambda p: p.groupby('g', sort=False).apply(lambda z: z.shape[0]))
dg = dg.map_partitions(lambda p: p.groupby("g", sort=False).apply(lambda z: z.shape[0]))
elif output == 'array':
elif output == "array":
# ranking arrays: one chunk per group. Each chunk must include all columns.
p = X.shape[1]
dX, dy, dw, dg = [], [], [], []
...
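Aside (not part of this diff): the run-length encoding mentioned in the comments above (per-row query ids turned into the group sizes LightGBMRanker expects) can be shown in isolation:

    # Per-row query ids -> run-length-encoded group sizes; within a partition sum(g) == n_samples.
    from itertools import groupby
    import numpy as np

    g = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2])  # query id for each row, grouped rows are contiguous
    g_rle = np.array([len(list(grp)) for _, grp in groupby(g)])
    print(g_rle)  # [3 2 4]
    assert g_rle.sum() == g.shape[0]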
@@ -148,71 +145,63 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
dw = da.concatenate(dw, axis=0)
dg = da.concatenate(dg, axis=0)
else:
raise ValueError('Ranking data creation only supported for Dask arrays and dataframes')
raise ValueError("Ranking data creation only supported for Dask arrays and dataframes")
return X, y, w, g_rle, dX, dy, dw, dg
def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **kwargs):
def _create_data(objective, n_samples=1_000, output="array", chunk_size=500, **kwargs):
if objective.endswith('classification'):
if objective.endswith("classification"):
if objective == 'binary-classification':
if objective == "binary-classification":
centers = [[-4, -4], [4, 4]]
elif objective == 'multiclass-classification':
elif objective == "multiclass-classification":
centers = [[-4, -4], [4, 4], [-4, 4]]
else:
raise ValueError(f"Unknown classification task '{objective}'")
X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
elif objective == 'regression':
elif objective == "regression":
X, y = make_regression(n_samples=n_samples, n_features=4, n_informative=2, random_state=42)
elif objective == 'ranking':
elif objective == "ranking":
return _create_ranking_data(
n_samples=n_samples, output=output, chunk_size=chunk_size, **kwargs)
else:
raise ValueError(f"Unknown objective '{objective}'")
rnd = np.random.RandomState(42)
weights = rnd.random(X.shape[0]) * 0.01
if output == 'array':
if output == "array":
dX = da.from_array(X, (chunk_size, X.shape[1]))
dy = da.from_array(y, chunk_size)
dw = da.from_array(weights, chunk_size)
elif output.startswith('dataframe'):
elif output.startswith("dataframe"):
X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
if output == 'dataframe-with-categorical':
if output == "dataframe-with-categorical":
num_cat_cols = 2
for i in range(num_cat_cols):
col_name = f"cat_col{i}"
cat_values = rnd.choice(['a', 'b'], X.shape[0])
cat_values = rnd.choice(["a", "b"], X.shape[0])
cat_series = pd.Series(cat_values, dtype="category")
cat_series = pd.Series(cat_values, dtype='category')
X_df[col_name] = cat_series
X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1)))
# make one categorical feature relevant to the target
cat_col_is_a = X_df['cat_col0'] == 'a'
cat_col_is_a = X_df["cat_col0"] == "a"
if objective == 'regression':
if objective == "regression":
y = np.where(cat_col_is_a, y, 2 * y)
elif objective == 'binary-classification':
elif objective == "binary-classification":
y = np.where(cat_col_is_a, y, 1 - y)
elif objective == 'multiclass-classification':
elif objective == "multiclass-classification":
n_classes = 3
y = np.where(cat_col_is_a, y, (1 + y) % n_classes)
y_df = pd.Series(y, name='target')
y_df = pd.Series(y, name="target")
dX = dd.from_pandas(X_df, chunksize=chunk_size)
dy = dd.from_pandas(y_df, chunksize=chunk_size)
dw = dd.from_array(weights, chunksize=chunk_size)
elif output == 'scipy_csr_matrix':
elif output == "scipy_csr_matrix":
dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csr_matrix)
dy = da.from_array(y, chunks=chunk_size)
dw = da.from_array(weights, chunk_size)
X = csr_matrix(X)
elif output == 'scipy_csc_matrix':
elif output == "scipy_csc_matrix":
dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csc_matrix)
dy = da.from_array(y, chunks=chunk_size)
dw = da.from_array(weights, chunk_size)
...
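Aside (not part of this diff): _create_data above exposes one dataset as several Dask collection types; the chunking pattern it relies on, reduced to a small sketch with made-up sizes:

    # One NumPy matrix viewed as a Dask array, a Dask DataFrame, and CSR-matrix chunks.
    import dask.array as da
    import dask.dataframe as dd
    import numpy as np
    import pandas as pd
    from scipy.sparse import csr_matrix

    X = np.random.rand(1_000, 4)
    chunk_size = 500
    dX_array = da.from_array(X, chunks=(chunk_size, X.shape[1]))
    X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
    dX_frame = dd.from_pandas(X_df, chunksize=chunk_size)
    dX_sparse = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csr_matrix)
    print(dX_array.numblocks, dX_frame.npartitions)  # (2, 1) 2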
@@ -234,7 +223,7 @@ def _accuracy_score(dy_true, dy_pred):
def _constant_metric(y_true, y_pred):
metric_name = 'constant_metric'
metric_name = "constant_metric"
value = 0.708
is_higher_better = False
return metric_name, value, is_higher_better
...
@@ -253,46 +242,32 @@ def _objective_logistic_regression(y_true, y_pred):
return grad, hess
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification"])
@pytest.mark.parametrize('boosting_type', boosting_types)
@pytest.mark.parametrize("boosting_type", boosting_types)
@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
@pytest.mark.parametrize("tree_learner", distributed_training_algorithms)
def test_classifier(output, task, boosting_type, tree_learner, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(objective=task, output=output)
params = {"boosting_type": boosting_type, "tree_learner": tree_learner, "n_estimators": 50, "num_leaves": 31}
if boosting_type == 'rf':
if boosting_type == "rf":
params.update({'bagging_freq': 1, 'bagging_fraction': 0.9,})
params.update({"bagging_freq": 1, "bagging_fraction": 0.9,})
elif boosting_type == 'goss':
elif boosting_type == "goss":
params['top_rate'] = 0.5
params["top_rate"] = 0.5
dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, **params)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
p1 = dask_classifier.predict(dX)
p1_raw = dask_classifier.predict(dX, raw_score=True).compute()
p1_first_iter_raw = dask_classifier.predict(dX, start_iteration=0, num_iteration=1, raw_score=True).compute()
p1_early_stop_raw = dask_classifier.predict(dX, pred_early_stop=True, pred_early_stop_margin=1.0, pred_early_stop_freq=2, raw_score=True).compute()
p1_proba = dask_classifier.predict_proba(dX).compute()
p1_pred_leaf = dask_classifier.predict(dX, pred_leaf=True)
...
@@ -306,7 +281,7 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster):
p2_proba = local_classifier.predict_proba(X)
s2 = local_classifier.score(X, y)
if boosting_type == 'rf':
if boosting_type == "rf":
# https://github.com/microsoft/LightGBM/issues/4118
assert_eq(s1, s2, atol=0.01)
assert_eq(p1_proba, p2_proba, atol=0.8)
...
@@ -329,47 +304,30 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster):
# pref_leaf values should have the right shape
# and values that look like valid tree nodes
pred_leaf_vals = p1_pred_leaf.compute()
assert pred_leaf_vals.shape == (X.shape[0], dask_classifier.booster_.num_trees())
assert np.max(pred_leaf_vals) <= params['num_leaves']
assert np.max(pred_leaf_vals) <= params["num_leaves"]
assert np.min(pred_leaf_vals) >= 0
assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"]
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == 'category']
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
tree_df = dask_classifier.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output + ['scipy_csc_matrix'])
@pytest.mark.parametrize("output", data_output + ["scipy_csc_matrix"])
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification"])
def test_classifier_pred_contrib(output, task, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(objective=task, output=output)
params = {"n_estimators": 10, "num_leaves": 10}
dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, tree_learner='data', **params)
dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, tree_learner="data", **params)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True)
...
@@ -390,10 +348,10 @@ def test_classifier_pred_contrib(output, task, cluster):
#
# since that case is so different than all other cases, check the relevant things here
# and then return early
if output.startswith('scipy') and task == 'multiclass-classification':
if output.startswith("scipy") and task == "multiclass-classification":
if output == 'scipy_csr_matrix':
if output == "scipy_csr_matrix":
expected_type = csr_matrix
elif output == 'scipy_csc_matrix':
elif output == "scipy_csc_matrix":
expected_type = csc_matrix
else:
raise ValueError(f"Unrecognized output type: {output}")
...
@@ -415,20 +373,17 @@ def test_classifier_pred_contrib(output, task, cluster):
return
preds_with_contrib = preds_with_contrib.compute()
if output.startswith('scipy'):
if output.startswith("scipy"):
preds_with_contrib = preds_with_contrib.toarray()
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == 'category']
tree_df = dask_classifier.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
# * shape depends on whether it is binary or multiclass classification
# * matrix for binary classification is of the form [feature_contrib, base_value],
...
@@ -446,8 +401,8 @@ def test_classifier_pred_contrib(output, task, cluster):
assert len(np.unique(preds_with_contrib[:, base_value_col]) == 1)
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification"])
def test_classifier_custom_objective(output, task, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
...
@@ -461,25 +416,19 @@ def test_classifier_custom_objective(output, task, cluster):
"verbose": -1,
"seed": 708,
"deterministic": True,
"force_col_wise": True
"force_col_wise": True,
}
if task == 'binary-classification':
if task == "binary-classification":
params.update({'objective': _objective_logistic_regression,})
params.update({"objective": _objective_logistic_regression,})
elif task == 'multiclass-classification':
elif task == "multiclass-classification":
params.update({'objective': sklearn_multiclass_custom_objective, 'num_classes': 3})
params.update({"objective": sklearn_multiclass_custom_objective, "num_classes": 3})
dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, tree_learner='data', **params)
dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, tree_learner="data", **params)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
dask_classifier_local = dask_classifier.to_local()
p1_raw = dask_classifier.predict(dX, raw_score=True).compute()
...
@@ -490,14 +439,14 @@ def test_classifier_custom_objective(output, task, cluster):
p2_raw = local_classifier.predict(X, raw_score=True)
# with a custom objective, prediction result is a raw score instead of predicted class
if task == 'binary-classification':
if task == "binary-classification":
p1_proba = 1.0 / (1.0 + np.exp(-p1_raw))
p1_class = (p1_proba > 0.5).astype(np.int64)
p1_proba_local = 1.0 / (1.0 + np.exp(-p1_raw_local))
p1_class_local = (p1_proba_local > 0.5).astype(np.int64)
p2_proba = 1.0 / (1.0 + np.exp(-p2_raw))
p2_class = (p2_proba > 0.5).astype(np.int64)
elif task == 'multiclass-classification':
elif task == "multiclass-classification":
p1_proba = np.exp(p1_raw) / np.sum(np.exp(p1_raw), axis=1).reshape(-1, 1)
p1_class = p1_proba.argmax(axis=1)
p1_proba_local = np.exp(p1_raw_local) / np.sum(np.exp(p1_raw_local), axis=1).reshape(-1, 1)
...
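Aside (not part of this diff): with a custom objective the sklearn-style predict() returns raw scores, so the tests above rebuild probabilities and labels by hand; the same conversions in isolation, with made-up scores:

    # Raw scores -> probabilities -> class labels, as done for the custom-objective checks.
    import numpy as np

    raw_binary = np.array([-2.0, 0.3, 1.5])
    proba = 1.0 / (1.0 + np.exp(-raw_binary))      # sigmoid
    label = (proba > 0.5).astype(np.int64)         # [0 1 1]

    raw_multi = np.array([[0.2, 1.0, -0.5], [2.0, 0.1, 0.3]])
    proba_multi = np.exp(raw_multi) / np.sum(np.exp(raw_multi), axis=1).reshape(-1, 1)  # softmax
    label_multi = proba_multi.argmax(axis=1)       # [1 0]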
@@ -520,7 +469,7 @@ def test_classifier_custom_objective(output, task, cluster):
def test_machines_to_worker_map_unparseable_host_names():
workers = {'0.0.0.1:80': {}, '0.0.0.2:80': {}}
workers = {"0.0.0.1:80": {}, "0.0.0.2:80": {}}
machines = "0.0.0.1:80,0.0.0.2:80"
with pytest.raises(ValueError, match="Could not parse host name from worker address '0.0.0.1:80'"):
lgb.dask._machines_to_worker_map(machines=machines, worker_addresses=workers.keys())
...
@@ -528,18 +477,13 @@ def test_machines_to_worker_map_unparseable_host_names():
def test_training_does_not_fail_on_port_conflicts(cluster):
with Client(cluster) as client:
_, _, _, _, dX, dy, dw, _ = _create_data('binary-classification', output='array')
_, _, _, _, dX, dy, dw, _ = _create_data("binary-classification", output="array")
lightgbm_default_port = 12400
workers_hostname = _get_workers_hostname(cluster)
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind((workers_hostname, lightgbm_default_port))
dask_classifier = lgb.DaskLGBMClassifier(
client=client, time_out=5, n_estimators=5, num_leaves=5)
for _ in range(5):
dask_classifier.fit(
X=dX,
...
@@ -549,15 +493,12 @@ def test_training_does_not_fail_on_port_conflicts(cluster):
assert dask_classifier.booster_
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize('boosting_type', boosting_types)
@pytest.mark.parametrize("boosting_type", boosting_types)
@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
@pytest.mark.parametrize("tree_learner", distributed_training_algorithms)
def test_regressor(output, boosting_type, tree_learner, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective='regression', output=output)
objective="regression", output=output)
params = {
"boosting_type": boosting_type,
...
@@ -565,18 +506,15 @@ def test_regressor(output, boosting_type, tree_learner, cluster):
"num_leaves": 31,
"n_estimators": 20,
}
if boosting_type == 'rf':
if boosting_type == "rf":
params.update({'bagging_freq': 1, 'bagging_fraction': 0.9,})
params.update({"bagging_freq": 1, "bagging_fraction": 0.9,})
dask_regressor = lgb.DaskLGBMRegressor(
client=client, time_out=5, tree=tree_learner, **params)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
p1 = dask_regressor.predict(dX)
p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
...
@@ -603,16 +541,13 @@ def test_regressor(output, boosting_type, tree_learner, cluster):
# pref_leaf values should have the right shape
# and values that look like valid tree nodes
pred_leaf_vals = p1_pred_leaf.compute()
assert pred_leaf_vals.shape == (X.shape[0], dask_regressor.booster_.num_trees())
assert np.max(pred_leaf_vals) <= params['num_leaves']
assert np.max(pred_leaf_vals) <= params["num_leaves"]
assert np.min(pred_leaf_vals) >= 0
assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"]
assert_eq(p1, y, rtol=0.5, atol=50.)
assert_eq(p1, y, rtol=0.5, atol=50.0)
assert_eq(p2, y, rtol=0.5, atol=50.)
assert_eq(p2, y, rtol=0.5, atol=50.0)
# extra predict() parameters should be passed through correctly
with pytest.raises(AssertionError):
...
@@ -620,36 +555,22 @@ def test_regressor(output, boosting_type, tree_learner, cluster):
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == 'category']
tree_df = dask_regressor.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("output", data_output)
def test_regressor_pred_contrib(output, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective="regression", output=output)
objective='regression', output=output)
params = {
"n_estimators": 10, "num_leaves": 10}
dask_regressor = lgb.DaskLGBMRegressor(
client=client, time_out=5, tree_learner="data", **params)
client=client, time_out=5, tree_learner='data', **params)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute()
...
@@ -668,39 +589,23 @@ def test_regressor_pred_contrib(output, cluster):
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == 'category']
tree_df = dask_regressor.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize('alpha', [.1, .5, .9])
@pytest.mark.parametrize("alpha", [0.1, 0.5, 0.9])
def test_regressor_quantile(output, alpha, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective="regression", output=output)
objective='regression', output=output)
params = {"objective": "quantile", "alpha": alpha, "random_state": 42, "n_estimators": 10, "num_leaves": 10}
dask_regressor = lgb.DaskLGBMRegressor(
client=client, tree_learner_type="data_parallel", **params)
client=client, tree_learner_type='data_parallel', **params)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
p1 = dask_regressor.predict(dX).compute()
q1 = np.count_nonzero(y < p1) / y.shape[0]
...
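Aside (not part of this diff): the quantile-regression test above measures coverage, i.e. roughly an alpha fraction of targets should fall below the predicted quantile; the same check in isolation, with a stand-in for the model's predictions:

    # Coverage check for a quantile prediction: share of y below the prediction ~= alpha.
    import numpy as np

    alpha = 0.9
    y = np.random.rand(1_000)
    pred = np.full_like(y, np.quantile(y, alpha))  # stand-in for model output
    q = np.count_nonzero(y < pred) / y.shape[0]
    assert abs(q - alpha) < 0.05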
@@ -716,37 +621,22 @@ def test_regressor_quantile(output, alpha, cluster):
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == 'category']
tree_df = dask_regressor.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("output", data_output)
def test_regressor_custom_objective(output, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective="regression", output=output)
objective='regression', output=output)
params = {"n_estimators": 10, "num_leaves": 10, "objective": _objective_least_squares}
dask_regressor = lgb.DaskLGBMRegressor(
client=client, time_out=5, tree_learner="data", **params)
client=client, time_out=5, tree_learner='data', **params)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
dask_regressor_local = dask_regressor.to_local()
p1 = dask_regressor.predict(dX)
...
@@ -772,34 +662,26 @@ def test_regressor_custom_objective(output, cluster):
...
@@ -772,34 +662,26 @@ def test_regressor_custom_objective(output, cluster):
assert_eq
(
p1
,
p1_local
)
assert_eq
(
p1
,
p1_local
)
# predictions should be better than random
# predictions should be better than random
assert_precision
=
{
"rtol"
:
0.5
,
"atol"
:
50.
}
assert_precision
=
{
"rtol"
:
0.5
,
"atol"
:
50.
0
}
assert_eq
(
p1
,
y
,
**
assert_precision
)
assert_eq
(
p1
,
y
,
**
assert_precision
)
assert_eq
(
p2
,
y
,
**
assert_precision
)
assert_eq
(
p2
,
y
,
**
assert_precision
)
@
pytest
.
mark
.
parametrize
(
'
output
'
,
[
'
array
'
,
'
dataframe
'
,
'
dataframe-with-categorical
'
])
@
pytest
.
mark
.
parametrize
(
"
output
"
,
[
"
array
"
,
"
dataframe
"
,
"
dataframe-with-categorical
"
])
@
pytest
.
mark
.
parametrize
(
'
group
'
,
[
None
,
group_sizes
])
@
pytest
.
mark
.
parametrize
(
"
group
"
,
[
None
,
group_sizes
])
@
pytest
.
mark
.
parametrize
(
'
boosting_type
'
,
boosting_types
)
@
pytest
.
mark
.
parametrize
(
"
boosting_type
"
,
boosting_types
)
@
pytest
.
mark
.
parametrize
(
'
tree_learner
'
,
distributed_training_algorithms
)
@
pytest
.
mark
.
parametrize
(
"
tree_learner
"
,
distributed_training_algorithms
)
def
test_ranker
(
output
,
group
,
boosting_type
,
tree_learner
,
cluster
):
def
test_ranker
(
output
,
group
,
boosting_type
,
tree_learner
,
cluster
):
with
Client
(
cluster
)
as
client
:
with
Client
(
cluster
)
as
client
:
if
output
==
'
dataframe-with-categorical
'
:
if
output
==
"
dataframe-with-categorical
"
:
X
,
y
,
w
,
g
,
dX
,
dy
,
dw
,
dg
=
_create_data
(
X
,
y
,
w
,
g
,
dX
,
dy
,
dw
,
dg
=
_create_data
(
objective
=
'ranking'
,
objective
=
"ranking"
,
output
=
output
,
group
=
group
,
n_features
=
1
,
n_informative
=
1
output
=
output
,
group
=
group
,
n_features
=
1
,
n_informative
=
1
)
)
else
:
else
:
X
,
y
,
w
,
g
,
dX
,
dy
,
dw
,
dg
=
_create_data
(
X
,
y
,
w
,
g
,
dX
,
dy
,
dw
,
dg
=
_create_data
(
objective
=
"ranking"
,
output
=
output
,
group
=
group
)
objective
=
'ranking'
,
output
=
output
,
group
=
group
)
# rebalance small dask.Array dataset for better performance.
# rebalance small dask.Array dataset for better performance.
if
output
==
'
array
'
:
if
output
==
"
array
"
:
dX
=
dX
.
persist
()
dX
=
dX
.
persist
()
dy
=
dy
.
persist
()
dy
=
dy
.
persist
()
dw
=
dw
.
persist
()
dw
=
dw
.
persist
()
...
@@ -814,20 +696,17 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster):
            "random_state": 42,
            "n_estimators": 50,
            "num_leaves": 20,
            "min_child_samples": 1,
        }
        if boosting_type == "rf":
            params.update(
                {
                    "bagging_freq": 1,
                    "bagging_fraction": 0.9,
                }
            )

        dask_ranker = lgb.DaskLGBMRanker(client=client, time_out=5, tree_learner_type=tree_learner, **params)
        dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
        rnkvec_dask = dask_ranker.predict(dX)
        rnkvec_dask = rnkvec_dask.compute()
...
@@ -835,11 +714,7 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster):
        p1_raw = dask_ranker.predict(dX, raw_score=True).compute()
        p1_first_iter_raw = dask_ranker.predict(dX, start_iteration=0, num_iteration=1, raw_score=True).compute()
        p1_early_stop_raw = dask_ranker.predict(
            dX, pred_early_stop=True, pred_early_stop_margin=1.0, pred_early_stop_freq=2, raw_score=True
        ).compute()
        rnkvec_dask_local = dask_ranker.to_local().predict(X)
...
@@ -864,47 +739,33 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster):
        # pref_leaf values should have the right shape
        # and values that look like valid tree nodes
        pred_leaf_vals = p1_pred_leaf.compute()
        assert pred_leaf_vals.shape == (X.shape[0], dask_ranker.booster_.num_trees())
        assert np.max(pred_leaf_vals) <= params["num_leaves"]
        assert np.min(pred_leaf_vals) >= 0
        assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"]

        # be sure LightGBM actually used at least one categorical column,
        # and that it was correctly treated as a categorical feature
        if output == "dataframe-with-categorical":
            cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
            tree_df = dask_ranker.booster_.trees_to_dataframe()
            node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
            assert node_uses_cat_col.sum() > 0
            assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="


@pytest.mark.parametrize("output", ["array", "dataframe", "dataframe-with-categorical"])
def test_ranker_custom_objective(output, cluster):
    with Client(cluster) as client:
        if output == "dataframe-with-categorical":
            X, y, w, g, dX, dy, dw, dg = _create_data(
                objective="ranking", output=output, group=group_sizes, n_features=1, n_informative=1
            )
        else:
            X, y, w, g, dX, dy, dw, dg = _create_data(objective="ranking", output=output, group=group_sizes)

        # rebalance small dask.Array dataset for better performance.
        if output == "array":
            dX = dX.persist()
            dy = dy.persist()
            dw = dw.persist()
...
@@ -917,15 +778,10 @@ def test_ranker_custom_objective(output, cluster):
            "n_estimators": 50,
            "num_leaves": 20,
            "min_child_samples": 1,
            "objective": _objective_least_squares,
        }
        dask_ranker = lgb.DaskLGBMRanker(client=client, time_out=5, tree_learner_type="data", **params)
        dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
        rnkvec_dask = dask_ranker.predict(dX).compute()
        dask_ranker_local = dask_ranker.to_local()
...
@@ -946,13 +802,13 @@ def test_ranker_custom_objective(output, cluster):
        assert callable(dask_ranker_local.objective_)


@pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize("eval_sizes", [[0.5, 1, 1.5], [0]])
@pytest.mark.parametrize("eval_names_prefix", ["specified", None])
def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix, cluster):
    if task == "ranking" and output == "scipy_csr_matrix":
        pytest.skip("LGBMRanker is not currently tested on sparse matrices")
    with Client(cluster) as client:
        # Use larger trainset to prevent premature stopping due to zero loss, causing num_trees() < n_estimators.
...
@@ -966,36 +822,33 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
        eval_init_score = None
        if eval_names_prefix:
            eval_names = [f"{eval_names_prefix}_{i}" for i in range(len(eval_sizes))]
        else:
            eval_names = None

        X, y, w, g, dX, dy, dw, dg = _create_data(
            objective=task, n_samples=n_samples, output=output, chunk_size=chunk_size
        )

        if task == "ranking":
            eval_metrics = ["ndcg"]
            eval_at = (5, 6)
            eval_metric_names = [f"ndcg@{k}" for k in eval_at]
            eval_group = []
        else:
            # test eval_class_weight, eval_init_score on binary-classification task.
            # Note: objective's default `metric` will be evaluated in evals_result_ in addition to all eval_metrics.
            if task == "binary-classification":
                eval_metrics = ["binary_error", "auc"]
                eval_metric_names = ["binary_logloss", "binary_error", "auc"]
                eval_class_weight = []
                eval_init_score = []
            elif task == "multiclass-classification":
                eval_metrics = ["multi_error"]
                eval_metric_names = ["multi_logloss", "multi_error"]
            elif task == "regression":
                eval_metrics = ["l1"]
                eval_metric_names = ["l2", "l1"]

        # create eval_sets by creating new datasets or copying training data.
        for eval_size in eval_sizes:
...
@@ -1008,23 +861,20 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
            else:
                n_eval_samples = max(chunk_size, int(n_samples * eval_size))
                _, y_e, _, _, dX_e, dy_e, dw_e, dg_e = _create_data(
                    objective=task, n_samples=n_eval_samples, output=output, chunk_size=chunk_size
                )

            eval_set.append((dX_e, dy_e))
            eval_sample_weight.append(dw_e)
            if task == "ranking":
                eval_group.append(dg_e)
            if task == "binary-classification":
                n_neg = np.sum(y_e == 0)
                n_pos = np.sum(y_e == 1)
                eval_class_weight.append({0: n_neg / n_pos, 1: n_pos / n_neg})
                init_score_value = np.log(np.mean(y_e) / (1 - np.mean(y_e)))
                if "dataframe" in output:
                    d_init_score = dy_e.map_partitions(lambda x, val=init_score_value: pd.Series([val] * x.size))
                else:
                    d_init_score = dy_e.map_blocks(lambda x, val=init_score_value: np.repeat(val, x.size))
...
@@ -1032,44 +882,36 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
                eval_init_score.append(d_init_score)

        fit_trees = 50
        params = {"random_state": 42, "n_estimators": fit_trees, "num_leaves": 2}
        model_factory = task_to_dask_factory[task]
        dask_model = model_factory(client=client, **params)

        fit_params = {
            "X": dX,
            "y": dy,
            "eval_set": eval_set,
            "eval_names": eval_names,
            "eval_sample_weight": eval_sample_weight,
            "eval_init_score": eval_init_score,
            "eval_metric": eval_metrics,
        }
        if task == "ranking":
            fit_params.update({"group": dg, "eval_group": eval_group, "eval_at": eval_at})
        elif task == "binary-classification":
            fit_params.update({"eval_class_weight": eval_class_weight})

        if eval_sizes == [0]:
            with pytest.warns(
                UserWarning,
                match="Worker (.*) was not allocated eval_set data. Therefore evals_result_ and best_score_ data may be unreliable.",
            ):
                dask_model.fit(**fit_params)
        else:
            dask_model = dask_model.fit(**fit_params)

        # total number of trees scales up for ova classifier.
        if task == "multiclass-classification":
            model_trees = fit_trees * dask_model.n_classes_
        else:
            model_trees = fit_trees
...
@@ -1098,67 +940,45 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
            assert len(evals_result[eval_name][metric]) == fit_trees


@pytest.mark.parametrize("task", ["binary-classification", "regression", "ranking"])
def test_eval_set_with_custom_eval_metric(task, cluster):
    with Client(cluster) as client:
        n_samples = 1000
        n_eval_samples = int(n_samples * 0.5)
        chunk_size = 10
        output = "array"

        X, y, w, g, dX, dy, dw, dg = _create_data(
            objective=task, n_samples=n_samples, output=output, chunk_size=chunk_size
        )
        _, _, _, _, dX_e, dy_e, _, dg_e = _create_data(
            objective=task, n_samples=n_eval_samples, output=output, chunk_size=chunk_size
        )

        if task == "ranking":
            eval_at = (5, 6)
            eval_metrics = ["ndcg", _constant_metric]
            eval_metric_names = [f"ndcg@{k}" for k in eval_at] + ["constant_metric"]
        elif task == "binary-classification":
            eval_metrics = ["binary_error", "auc", _constant_metric]
            eval_metric_names = ["binary_logloss", "binary_error", "auc", "constant_metric"]
        else:
            eval_metrics = ["l1", _constant_metric]
            eval_metric_names = ["l2", "l1", "constant_metric"]

        fit_trees = 50
        params = {"random_state": 42, "n_estimators": fit_trees, "num_leaves": 2}
        model_factory = task_to_dask_factory[task]
        dask_model = model_factory(client=client, **params)

        eval_set = [(dX_e, dy_e)]
        fit_params = {"X": dX, "y": dy, "eval_set": eval_set, "eval_metric": eval_metrics}
        if task == "ranking":
            fit_params.update({"group": dg, "eval_group": [dg_e], "eval_at": eval_at})

        dask_model = dask_model.fit(**fit_params)

        eval_name = "valid_0"
        evals_result = dask_model.evals_result_
        assert len(evals_result) == 1
        assert eval_name in evals_result
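`_constant_metric` above is a callable metric forwarded through `eval_metric`; its body is defined elsewhere in the test module and does not appear in this diff. A minimal sketch, assuming the `(y_true, y_pred) -> (name, value, is_higher_better)` signature that the scikit-learn interface accepts and the 0.708 value asserted later in this test, might be:

def _constant_metric(y_true, y_pred):
    # always reports the same value, so the test can check that one entry is recorded per boosting round
    return "constant_metric", 0.708, False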
...
@@ -1167,29 +987,21 @@ def test_eval_set_with_custom_eval_metric(task, cluster):
            assert metric in evals_result[eval_name]
            assert len(evals_result[eval_name][metric]) == fit_trees
        np.testing.assert_allclose(evals_result[eval_name]["constant_metric"], 0.708)


@pytest.mark.parametrize("task", tasks)
def test_training_works_if_client_not_provided_or_set_after_construction(task, cluster):
    with Client(cluster) as client:
        _, _, _, _, dX, dy, _, dg = _create_data(objective=task, output="array", group=None)
        model_factory = task_to_dask_factory[task]

        params = {"time_out": 5, "n_estimators": 1, "num_leaves": 2}

        # should be able to use the class without specifying a client
        dask_model = model_factory(**params)
        assert dask_model.client is None
        with pytest.raises(lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"):
            dask_model.client_

        dask_model.fit(dX, dy, group=dg)
...
@@ -1213,7 +1025,7 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, c
        dask_model.set_params(client=client)
        assert dask_model.client == client
        with pytest.raises(lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"):
            dask_model.client_

        dask_model.fit(dX, dy, group=dg)
...
@@ -1233,34 +1045,23 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, c
            local_model.client_


@pytest.mark.parametrize("serializer", ["pickle", "joblib", "cloudpickle"])
@pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize("set_client", [True, False])
def test_model_and_local_version_are_picklable_whether_or_not_client_set_explicitly(
    serializer, task, set_client, tmp_path, cluster, cluster2
):
    with Client(cluster) as client1:
        # data on cluster1
        X_1, _, _, _, dX_1, dy_1, _, dg_1 = _create_data(objective=task, output="array", group=None)

        with Client(cluster2) as client2:
            # create identical data on cluster2
            X_2, _, _, _, dX_2, dy_2, _, dg_2 = _create_data(objective=task, output="array", group=None)

            model_factory = task_to_dask_factory[task]

            params = {"time_out": 5, "n_estimators": 1, "num_leaves": 2}

            # at this point, the result of default_client() is client2 since it was the most recently
            # created. So setting client to client1 here to test that you can select a non-default client
...
@@ -1277,33 +1078,21 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici
            else:
                assert dask_model.client is None
                with pytest.raises(lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"):
                    dask_model.client_

            assert "client" not in local_model.get_params()
            assert getattr(local_model, "client", None) is None

            tmp_file = tmp_path / "model-1.pkl"
            pickle_obj(obj=dask_model, filepath=tmp_file, serializer=serializer)
            model_from_disk = unpickle_obj(filepath=tmp_file, serializer=serializer)

            local_tmp_file = tmp_path / "local-model-1.pkl"
            pickle_obj(obj=local_model, filepath=local_tmp_file, serializer=serializer)
            local_model_from_disk = unpickle_obj(filepath=local_tmp_file, serializer=serializer)

            assert model_from_disk.client is None
...
@@ -1312,7 +1101,9 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici
            else:
                assert dask_model.client is None
                with pytest.raises(
                    lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"
                ):
                    dask_model.client_

            # client will always be None after unpickling
...
@@ -1340,26 +1131,12 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici
                local_model.client_

            tmp_file2 = tmp_path / "model-2.pkl"
            pickle_obj(obj=dask_model, filepath=tmp_file2, serializer=serializer)
            fitted_model_from_disk = unpickle_obj(filepath=tmp_file2, serializer=serializer)

            local_tmp_file2 = tmp_path / "local-model-2.pkl"
            pickle_obj(obj=local_model, filepath=local_tmp_file2, serializer=serializer)
            local_fitted_model_from_disk = unpickle_obj(filepath=local_tmp_file2, serializer=serializer)

            if set_client:
                assert dask_model.client == client1
...
@@ -1405,35 +1182,25 @@ def test_warns_and_continues_on_unrecognized_tree_learner(cluster):
        X = da.random.random((1e3, 10))
        y = da.random.random((1e3, 1))
        dask_regressor = lgb.DaskLGBMRegressor(
            client=client, time_out=5, tree_learner="some-nonsense-value", n_estimators=1, num_leaves=2
        )
        with pytest.warns(UserWarning, match="Parameter tree_learner set to some-nonsense-value"):
            dask_regressor = dask_regressor.fit(X, y)

        assert dask_regressor.fitted_


@pytest.mark.parametrize("tree_learner", ["data_parallel", "voting_parallel"])
def test_training_respects_tree_learner_aliases(tree_learner, cluster):
    with Client(cluster) as client:
        task = "regression"
        _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output="array")
        dask_factory = task_to_dask_factory[task]
        dask_model = dask_factory(client=client, tree_learner=tree_learner, time_out=5, n_estimators=10, num_leaves=15)
        dask_model.fit(dX, dy, sample_weight=dw, group=dg)

        assert dask_model.fitted_
        assert dask_model.get_params()["tree_learner"] == tree_learner


def test_error_on_feature_parallel_tree_learner(cluster):
...
@@ -1444,39 +1211,30 @@ def test_error_on_feature_parallel_tree_learner(cluster):
        _ = wait([X, y])
        client.rebalance()
        dask_regressor = lgb.DaskLGBMRegressor(
            client=client, time_out=5, tree_learner="feature_parallel", n_estimators=1, num_leaves=2
        )
        with pytest.raises(lgb.basic.LightGBMError, match="Do not support feature parallel in c api"):
            dask_regressor = dask_regressor.fit(X, y)


def test_errors(cluster):
    with Client(cluster) as client:

        def f(part):
            raise Exception("foo")

        df = dd.demo.make_timeseries()
        df = df.map_partitions(f, meta=df._meta)
        with pytest.raises(Exception) as info:
            lgb.dask._train(client=client, data=df, label=df.x, params={}, model_factory=lgb.LGBMClassifier)
            assert "foo" in str(info.value)


@pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize("output", data_output)
def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, output, cluster_three_workers):
    if task == "ranking" and output == "scipy_csr_matrix":
        pytest.skip("LGBMRanker is not currently tested on sparse matrices")

    with Client(cluster_three_workers) as client:
        _, y, _, _, dX, dy, dw, dg = _create_data(
...
@@ -1489,7 +1247,7 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, outpu
        dask_model_factory = task_to_dask_factory[task]

        workers = list(client.scheduler_info()["workers"].keys())
        assert len(workers) == 3
        first_two_workers = workers[:2]
...
@@ -1506,33 +1264,28 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, outpu
        assert len(workers_with_data) == 2

        params = {
            "time_out": 5,
            "random_state": 42,
            "num_leaves": 10,
            "n_estimators": 20,
        }

        dask_model = dask_model_factory(tree="data", client=client, **params)
        dask_model.fit(dX, dy, group=dg, sample_weight=dw)
        dask_preds = dask_model.predict(dX).compute()
        if task == "regression":
            score = r2_score(y, dask_preds)
        elif task.endswith("classification"):
            score = accuracy_score(y, dask_preds)
        else:
            score = spearmanr(dask_preds, y).correlation
        assert score > 0.9


@pytest.mark.parametrize("task", tasks)
def test_network_params_not_required_but_respected_if_given(task, listen_port, cluster):
    with Client(cluster) as client:
        _, _, _, _, dX, dy, _, dg = _create_data(objective=task, output="array", chunk_size=10, group=None)
        dask_model_factory = task_to_dask_factory[task]
...
@@ -1547,11 +1300,11 @@ def test_network_params_not_required_but_respected_if_given(task, listen_port, c
        dask_model1.fit(dX, dy, group=dg)
        assert dask_model1.fitted_
        params = dask_model1.get_params()
        assert "local_listen_port" not in params
        assert "machines" not in params

        # model 2 - machines given
        workers = list(client.scheduler_info()["workers"])
        workers_hostname = _get_workers_hostname(cluster)
        remote_sockets, open_ports = lgb.dask._assign_open_ports_to_workers(client, workers)
        for s in remote_sockets.values():
...
@@ -1559,58 +1312,43 @@ def test_network_params_not_required_but_respected_if_given(task, listen_port, c
        dask_model2 = dask_model_factory(
            n_estimators=5,
            num_leaves=5,
            machines=",".join([f"{workers_hostname}:{port}" for port in open_ports.values()]),
        )
        dask_model2.fit(dX, dy, group=dg)
        assert dask_model2.fitted_
        params = dask_model2.get_params()
        assert "local_listen_port" not in params
        assert "machines" in params

        # model 3 - local_listen_port given
        # training should fail because LightGBM will try to use the same
        # port for multiple worker processes on the same machine
        dask_model3 = dask_model_factory(n_estimators=5, num_leaves=5, local_listen_port=listen_port)
        error_msg = "has multiple Dask worker processes running on it"
        with pytest.raises(lgb.basic.LightGBMError, match=error_msg):
            dask_model3.fit(dX, dy, group=dg)


@pytest.mark.parametrize("task", tasks)
def test_machines_should_be_used_if_provided(task, cluster):
    pytest.skip("skipping due to timeout issues discussed in https://github.com/microsoft/LightGBM/issues/5390")
    with Client(cluster) as client:
        _, _, _, _, dX, dy, _, dg = _create_data(objective=task, output="array", chunk_size=10, group=None)
        dask_model_factory = task_to_dask_factory[task]

        # rebalance data to be sure that each worker has a piece of the data
        client.rebalance()

        n_workers = len(client.scheduler_info()["workers"])
        assert n_workers > 1
        workers_hostname = _get_workers_hostname(cluster)
        open_ports = lgb.dask._find_n_open_ports(n_workers)
        dask_model = dask_model_factory(
            n_estimators=5,
            num_leaves=5,
            machines=",".join([f"{workers_hostname}:{port}" for port in open_ports]),
        )

        # test that "machines" is actually respected by creating a socket that uses
...
@@ -1626,12 +1364,7 @@ def test_machines_should_be_used_if_provided(task, cluster):
        # an informative error should be raised if "machines" has duplicates
        one_open_port = lgb.dask._find_n_open_ports(1)
        dask_model.set_params(machines=",".join([f"127.0.0.1:{one_open_port}" for _ in range(n_workers)]))
        with pytest.raises(ValueError, match="Found duplicates in 'machines'"):
            dask_model.fit(dX, dy, group=dg)
...
@@ -1641,8 +1374,8 @@ def test_machines_should_be_used_if_provided(task, cluster):
    [
        (lgb.DaskLGBMClassifier, lgb.LGBMClassifier),
        (lgb.DaskLGBMRegressor, lgb.LGBMRegressor),
        (lgb.DaskLGBMRanker, lgb.LGBMRanker),
    ],
)
def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except_client_arg(classes):
    dask_spec = inspect.getfullargspec(classes[0])
...
@@ -1655,7 +1388,7 @@ def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except
    # "client" should be the only different, and the final argument
    assert dask_spec.args[:-1] == sklearn_spec.args
    assert dask_spec.defaults[:-1] == sklearn_spec.defaults
    assert dask_spec.args[-1] == "client"
    assert dask_spec.defaults[-1] is None
...
@@ -1668,18 +1401,18 @@ def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except
        (lgb.DaskLGBMRegressor.fit, lgb.LGBMRegressor.fit),
        (lgb.DaskLGBMRegressor.predict, lgb.LGBMRegressor.predict),
        (lgb.DaskLGBMRanker.fit, lgb.LGBMRanker.fit),
        (lgb.DaskLGBMRanker.predict, lgb.LGBMRanker.predict),
    ],
)
def test_dask_methods_and_sklearn_equivalents_have_similar_signatures(methods):
    dask_spec = inspect.getfullargspec(methods[0])
    sklearn_spec = inspect.getfullargspec(methods[1])
    dask_params = inspect.signature(methods[0]).parameters
    sklearn_params = inspect.signature(methods[1]).parameters
    assert dask_spec.args == sklearn_spec.args[: len(dask_spec.args)]
    assert dask_spec.varargs == sklearn_spec.varargs
    if sklearn_spec.varkw:
        assert dask_spec.varkw == sklearn_spec.varkw[: len(dask_spec.varkw)]
    assert dask_spec.kwonlyargs == sklearn_spec.kwonlyargs
    assert dask_spec.kwonlydefaults == sklearn_spec.kwonlydefaults
    for param in dask_spec.args:
...
@@ -1687,14 +1420,10 @@ def test_dask_methods_and_sklearn_equivalents_have_similar_signatures(methods):
        assert dask_params[param].default == sklearn_params[param].default, error_msg


@pytest.mark.parametrize("task", tasks)
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task, cluster):
    with Client(cluster):
        _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output="dataframe", group=None)

        model_factory = task_to_dask_factory[task]
...
@@ -1702,58 +1431,41 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
        dy_col_array = dy.reshape(-1, 1)
        assert len(dy_col_array.shape) == 2 and dy_col_array.shape[1] == 1

        params = {"n_estimators": 1, "num_leaves": 3, "random_state": 0, "time_out": 5}
        model = model_factory(**params)
        model.fit(dX, dy_col_array, sample_weight=dw, group=dg)
        assert model.fitted_


@pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize("output", data_output)
def test_init_score(task, output, cluster):
    if task == "ranking" and output == "scipy_csr_matrix":
        pytest.skip("LGBMRanker is not currently tested on sparse matrices")

    with Client(cluster) as client:
        _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output=output, group=None)

        model_factory = task_to_dask_factory[task]

        params = {"n_estimators": 1, "num_leaves": 2, "time_out": 5}
        init_score = random.random()
        size_factor = 1
        if task == "multiclass-classification":
            size_factor = 3  # number of classes
        if output.startswith("dataframe"):
            init_scores = dy.map_partitions(lambda x: pd.DataFrame([[init_score] * size_factor] * x.size))
        else:
            init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score))
        model = model_factory(client=client, **params)
        model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
        # value of the root node is 0 when init_score is set
        assert model.booster_.trees_to_dataframe()["value"][0] == 0


def sklearn_checks_to_run():
    check_names = ["check_estimator_get_tags_default_keys", "check_get_params_invariance", "check_set_params"]
    for check_name in check_names:
        check_func = getattr(sklearn_checks, check_name, None)
        if check_func:
...
@@ -1782,79 +1494,58 @@ def test_parameters_default_constructible(estimator):
    sklearn_checks.check_parameters_default_constructible(name, Estimator)


@pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize("output", data_output)
def test_predict_with_raw_score(task, output, cluster):
    if task == "ranking" and output == "scipy_csr_matrix":
        pytest.skip("LGBMRanker is not currently tested on sparse matrices")

    with Client(cluster) as client:
        _, _, _, _, dX, dy, _, dg = _create_data(objective=task, output=output, group=None)

        model_factory = task_to_dask_factory[task]
        params = {"client": client, "n_estimators": 1, "num_leaves": 2, "time_out": 5, "min_sum_hessian": 0}
        model = model_factory(**params)
        model.fit(dX, dy, group=dg)
        raw_predictions = model.predict(dX, raw_score=True).compute()

        trees_df = model.booster_.trees_to_dataframe()
        leaves_df = trees_df[trees_df.node_depth == 2]
        if task == "multiclass-classification":
            for i in range(model.n_classes_):
                class_df = leaves_df[leaves_df.tree_index == i]
                assert set(raw_predictions[:, i]) == set(class_df["value"])
        else:
            assert set(raw_predictions) == set(leaves_df["value"])

        if task.endswith("classification"):
            pred_proba_raw = model.predict_proba(dX, raw_score=True).compute()
            assert_eq(raw_predictions, pred_proba_raw)


def test_distributed_quantized_training(cluster):
    with Client(cluster) as client:
        X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output="array")

        np.savetxt("data_dask.csv", np.hstack([np.array([y]).T, X]), fmt="%f,%f,%f,%f,%f")

        params = {
            "boosting_type": "gbdt",
            "n_estimators": 50,
            "num_leaves": 31,
            "use_quantized_grad": True,
            "num_grad_quant_bins": 30,
            "quant_train_renew_leaf": True,
            "verbose": -1,
        }

        quant_dask_classifier = lgb.DaskLGBMRegressor(client=client, time_out=5, **params)
        quant_dask_classifier = quant_dask_classifier.fit(dX, dy, sample_weight=dw)
        quant_p1 = quant_dask_classifier.predict(dX)
        quant_rmse = np.sqrt(np.mean((quant_p1.compute() - y) ** 2))

        params["use_quantized_grad"] = False
        dask_classifier = lgb.DaskLGBMRegressor(client=client, time_out=5, **params)
        dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
        p1 = dask_classifier.predict(dX)
        rmse = np.sqrt(np.mean((p1.compute() - y) ** 2))
...
tests/python_package_test/test_dual.py
View file @
1b792e71
...
@@ -28,7 +28,7 @@ def test_cpu_and_gpu_work():
    params_gpu = params_cpu.copy()
    params_gpu["device"] = "gpu"
    # Double-precision floats are only supported on x86_64 with PoCL
    params_gpu["gpu_use_dp"] = platform.machine() == "x86_64"
    gpu_bst = lgb.train(params_gpu, data, num_boost_round=10)
    gpu_score = log_loss(y, gpu_bst.predict(X))
...
tests/python_package_test/test_engine.py  View file @ 1b792e71
This source diff could not be displayed because it is too large. You can view the blob instead.
tests/python_package_test/test_plotting.py  View file @ 1b792e71
...
@@ -9,7 +9,8 @@ from lightgbm.compat import GRAPHVIZ_INSTALLED, MATPLOTLIB_INSTALLED, PANDAS_INS
if MATPLOTLIB_INSTALLED:
    import matplotlib

    matplotlib.use("Agg")
if GRAPHVIZ_INSTALLED:
    import graphviz
...
@@ -18,8 +19,7 @@ from .utils import load_breast_cancer, make_synthetic_regression
@pytest.fixture(scope="module")
def breast_cancer_split():
    return train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)


def _categorical_data(category_values_lower_bound, category_values_upper_bound):
...
@@ -41,51 +41,51 @@ def train_data(breast_cancer_split):
@pytest.fixture
def params():
    return {"objective": "binary", "verbose": -1, "num_leaves": 3}


@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_importance(params, breast_cancer_split, train_data):
    X_train, _, y_train, _ = breast_cancer_split

    gbm0 = lgb.train(params, train_data, num_boost_round=10)
    ax0 = lgb.plot_importance(gbm0)
    assert isinstance(ax0, matplotlib.axes.Axes)
    assert ax0.get_title() == "Feature importance"
    assert ax0.get_xlabel() == "Feature importance"
    assert ax0.get_ylabel() == "Features"
    assert len(ax0.patches) <= 30

    gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    gbm1.fit(X_train, y_train)

    ax1 = lgb.plot_importance(gbm1, color="r", title="t", xlabel="x", ylabel="y")
    assert isinstance(ax1, matplotlib.axes.Axes)
    assert ax1.get_title() == "t"
    assert ax1.get_xlabel() == "x"
    assert ax1.get_ylabel() == "y"
    assert len(ax1.patches) <= 30
    for patch in ax1.patches:
        assert patch.get_facecolor() == (1.0, 0, 0, 1.0)  # red

    ax2 = lgb.plot_importance(gbm0, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None)
    assert isinstance(ax2, matplotlib.axes.Axes)
    assert ax2.get_title() == ""
    assert ax2.get_xlabel() == ""
    assert ax2.get_ylabel() == ""
    assert len(ax2.patches) <= 30
    assert ax2.patches[0].get_facecolor() == (1.0, 0, 0, 1.0)  # r
    assert ax2.patches[1].get_facecolor() == (0.75, 0.75, 0, 1.0)  # y
    assert ax2.patches[2].get_facecolor() == (0, 0.5, 0, 1.0)  # g
    assert ax2.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # b

    ax3 = lgb.plot_importance(gbm0, title="t @importance_type@", xlabel="x @importance_type@", ylabel="y @importance_type@")
    assert isinstance(ax3, matplotlib.axes.Axes)
    assert ax3.get_title() == "t @importance_type@"
    assert ax3.get_xlabel() == "x split"
    assert ax3.get_ylabel() == "y @importance_type@"
    assert len(ax3.patches) <= 30

    gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, importance_type="gain")
...
@@ -108,51 +108,59 @@ def test_plot_importance(params, breast_cancer_split, train_data):
    assert first_bar1 != first_bar3


@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_split_value_histogram(params, breast_cancer_split, train_data):
    X_train, _, y_train, _ = breast_cancer_split

    gbm0 = lgb.train(params, train_data, num_boost_round=10)
    ax0 = lgb.plot_split_value_histogram(gbm0, 27)
    assert isinstance(ax0, matplotlib.axes.Axes)
    assert ax0.get_title() == "Split value histogram for feature with index 27"
    assert ax0.get_xlabel() == "Feature split value"
    assert ax0.get_ylabel() == "Count"
    assert len(ax0.patches) <= 2

    gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    gbm1.fit(X_train, y_train)

    ax1 = lgb.plot_split_value_histogram(
        gbm1,
        gbm1.booster_.feature_name()[27],
        figsize=(10, 5),
        title="Histogram for feature @index/name@ @feature@",
        xlabel="x",
        ylabel="y",
        color="r",
    )
    assert isinstance(ax1, matplotlib.axes.Axes)
    title = f"Histogram for feature name {gbm1.booster_.feature_name()[27]}"
    assert ax1.get_title() == title
    assert ax1.get_xlabel() == "x"
    assert ax1.get_ylabel() == "y"
    assert len(ax1.patches) <= 2
    for patch in ax1.patches:
        assert patch.get_facecolor() == (1.0, 0, 0, 1.0)  # red

    ax2 = lgb.plot_split_value_histogram(
        gbm0, 27, bins=10, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None
    )
    assert isinstance(ax2, matplotlib.axes.Axes)
    assert ax2.get_title() == ""
    assert ax2.get_xlabel() == ""
    assert ax2.get_ylabel() == ""
    assert len(ax2.patches) == 10
    assert ax2.patches[0].get_facecolor() == (1.0, 0, 0, 1.0)  # r
    assert ax2.patches[1].get_facecolor() == (0.75, 0.75, 0, 1.0)  # y
    assert ax2.patches[2].get_facecolor() == (0, 0.5, 0, 1.0)  # g
    assert ax2.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # b
    with pytest.raises(ValueError):
        lgb.plot_split_value_histogram(gbm0, 0)  # was not used in splitting


@pytest.mark.skipif(
    not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED,
    reason="matplotlib or graphviz is not installed",
)
def test_plot_tree(breast_cancer_split):
    X_train, _, y_train, _ = breast_cancer_split
    gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
...
@@ -161,14 +169,14 @@ def test_plot_tree(breast_cancer_split):
    with pytest.raises(IndexError):
        lgb.plot_tree(gbm, tree_index=83)

    ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=["split_gain"])
    assert isinstance(ax, matplotlib.axes.Axes)
    w, h = ax.axes.get_figure().get_size_inches()
    assert int(w) == 15
    assert int(h) == 8


@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_create_tree_digraph(breast_cancer_split):
    X_train, _, y_train, _ = breast_cancer_split
...
@@ -179,28 +187,32 @@ def test_create_tree_digraph(breast_cancer_split):
    with pytest.raises(IndexError):
        lgb.create_tree_digraph(gbm, tree_index=83)

    graph = lgb.create_tree_digraph(
        gbm,
        tree_index=3,
        show_info=["split_gain", "internal_value", "internal_weight"],
        name="Tree4",
        node_attr={"color": "red"},
    )
    graph.render(view=False)
    assert isinstance(graph, graphviz.Digraph)
    assert graph.name == "Tree4"
    assert len(graph.node_attr) == 1
    assert graph.node_attr["color"] == "red"
    assert len(graph.graph_attr) == 0
    assert len(graph.edge_attr) == 0
    graph_body = "".join(graph.body)
    assert "leaf" in graph_body
    assert "gain" in graph_body
    assert "value" in graph_body
    assert "weight" in graph_body
    assert "#ffdddd" in graph_body
    assert "#ddffdd" in graph_body
    assert "data" not in graph_body
    assert "count" not in graph_body


@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_below_max_category_values():
    X_train, y_train = _categorical_data(2, 10)
    params = {
...
@@ -211,7 +223,7 @@ def test_tree_with_categories_below_max_category_values():
        "deterministic": True,
        "num_threads": 1,
        "seed": 708,
        "verbose": -1,
    }
    gbm = lgb.LGBMClassifier(**params)
    gbm.fit(X_train, y_train)
...
@@ -219,28 +231,32 @@ def test_tree_with_categories_below_max_category_values():
    with pytest.raises(IndexError):
        lgb.create_tree_digraph(gbm, tree_index=83)

    graph = lgb.create_tree_digraph(
        gbm,
        tree_index=3,
        show_info=["split_gain", "internal_value", "internal_weight"],
        name="Tree4",
        node_attr={"color": "red"},
        max_category_values=10,
    )
    graph.render(view=False)
    assert isinstance(graph, graphviz.Digraph)
    assert graph.name == "Tree4"
    assert len(graph.node_attr) == 1
    assert graph.node_attr["color"] == "red"
    assert len(graph.graph_attr) == 0
    assert len(graph.edge_attr) == 0
    graph_body = "".join(graph.body)
    assert "leaf" in graph_body
    assert "gain" in graph_body
    assert "value" in graph_body
    assert "weight" in graph_body
    assert "data" not in graph_body
    assert "count" not in graph_body
    assert "||...||" not in graph_body


@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_above_max_category_values():
    X_train, y_train = _categorical_data(20, 30)
    params = {
...
@@ -251,7 +267,7 @@ def test_tree_with_categories_above_max_category_values():
        "deterministic": True,
        "num_threads": 1,
        "seed": 708,
        "verbose": -1,
    }
    gbm = lgb.LGBMClassifier(**params)
    gbm.fit(X_train, y_train)
...
@@ -259,32 +275,36 @@ def test_tree_with_categories_above_max_category_values():
    with pytest.raises(IndexError):
        lgb.create_tree_digraph(gbm, tree_index=83)

    graph = lgb.create_tree_digraph(
        gbm,
        tree_index=9,
        show_info=["split_gain", "internal_value", "internal_weight"],
        name="Tree4",
        node_attr={"color": "red"},
        max_category_values=4,
    )
    graph.render(view=False)
    assert isinstance(graph, graphviz.Digraph)
    assert graph.name == "Tree4"
    assert len(graph.node_attr) == 1
    assert graph.node_attr["color"] == "red"
    assert len(graph.graph_attr) == 0
    assert len(graph.edge_attr) == 0
    graph_body = "".join(graph.body)
    assert "leaf" in graph_body
    assert "gain" in graph_body
    assert "value" in graph_body
    assert "weight" in graph_body
    assert "data" not in graph_body
    assert "count" not in graph_body
    assert "||...||" in graph_body


@pytest.mark.parametrize("use_missing", [True, False])
@pytest.mark.parametrize("zero_as_missing", [True, False])
def test_numeric_split_direction(use_missing, zero_as_missing):
    if use_missing and zero_as_missing:
        pytest.skip("use_missing and zero_as_missing both set to True")
    X, y = make_synthetic_regression()
    rng = np.random.RandomState(0)
    zero_mask = rng.rand(X.shape[0]) < 0.05
...
@@ -294,48 +314,48 @@ def test_numeric_split_direction(use_missing, zero_as_missing):
    X[nan_mask, :] = np.nan
    ds = lgb.Dataset(X, y)
    params = {
        "num_leaves": 127,
        "min_child_samples": 1,
        "use_missing": use_missing,
        "zero_as_missing": zero_as_missing,
    }
    bst = lgb.train(params, ds, num_boost_round=1)

    case_with_zero = X[zero_mask][[0]]
    expected_leaf_zero = bst.predict(case_with_zero, pred_leaf=True)[0]
    node = bst.dump_model()["tree_info"][0]["tree_structure"]
    while "decision_type" in node:
        direction = lgb.plotting._determine_direction_for_numeric_split(
            case_with_zero[0][node["split_feature"]], node["threshold"], node["missing_type"], node["default_left"]
        )
        node = node["left_child"] if direction == "left" else node["right_child"]
    assert node["leaf_index"] == expected_leaf_zero

    if use_missing:
        case_with_nan = X[nan_mask][[0]]
        expected_leaf_nan = bst.predict(case_with_nan, pred_leaf=True)[0]
        node = bst.dump_model()["tree_info"][0]["tree_structure"]
        while "decision_type" in node:
            direction = lgb.plotting._determine_direction_for_numeric_split(
                case_with_nan[0][node["split_feature"]], node["threshold"], node["missing_type"], node["default_left"]
            )
            node = node["left_child"] if direction == "left" else node["right_child"]
        assert node["leaf_index"] == expected_leaf_nan
        assert expected_leaf_zero != expected_leaf_nan


@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_example_case_in_tree_digraph():
    rng = np.random.RandomState(0)
    x1 = rng.rand(100)
    cat = rng.randint(1, 3, size=x1.size)
    X = np.vstack([x1, cat]).T
    y = x1 + 2 * cat
    feature_name = ["x1", "cat"]
    ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=["cat"])
    num_round = 3
    bst = lgb.train({"num_leaves": 7}, ds, num_boost_round=num_round)
    mod = bst.dump_model()
    example_case = X[[0]]
    makes_categorical_splits = False
...
@@ -343,42 +363,46 @@ def test_example_case_in_tree_digraph():
    for i in range(num_round):
        graph = lgb.create_tree_digraph(bst, example_case=example_case, tree_index=i)
        gbody = graph.body
        node = mod["tree_info"][i]["tree_structure"]
        while "decision_type" in node:  # iterate through the splits
            split_index = node["split_index"]

            node_in_graph = [n for n in gbody if f"split{split_index}" in n and "->" not in n]
            assert len(node_in_graph) == 1
            seen_indices.add(gbody.index(node_in_graph[0]))

            edge_to_node = [e for e in gbody if f"-> split{split_index}" in e]
            if node["decision_type"] == "<=":
                direction = lgb.plotting._determine_direction_for_numeric_split(
                    example_case[0][node["split_feature"]],
                    node["threshold"],
                    node["missing_type"],
                    node["default_left"],
                )
            else:
                makes_categorical_splits = True
                direction = lgb.plotting._determine_direction_for_categorical_split(
                    example_case[0][node["split_feature"]], node["threshold"]
                )
            node = node["left_child"] if direction == "left" else node["right_child"]
            assert "color=blue" in node_in_graph[0]
            if edge_to_node:
                assert len(edge_to_node) == 1
                assert "color=blue" in edge_to_node[0]
                seen_indices.add(gbody.index(edge_to_node[0]))

        # we're in a leaf now
        leaf_index = node["leaf_index"]
        leaf_in_graph = [n for n in gbody if f"leaf{leaf_index}" in n and "->" not in n]
        edge_to_leaf = [e for e in gbody if f"-> leaf{leaf_index}" in e]
        assert len(leaf_in_graph) == 1
        assert "color=blue" in leaf_in_graph[0]
        assert len(edge_to_leaf) == 1
        assert "color=blue" in edge_to_leaf[0]
        seen_indices.update([gbody.index(leaf_in_graph[0]), gbody.index(edge_to_leaf[0])])

        # check that the rest of the elements have black color
        remaining_elements = [e for i, e in enumerate(graph.body) if i not in seen_indices and "graph" not in e]
        assert all("color=black" in e for e in remaining_elements)

        # check that we got to the expected leaf
        expected_leaf = bst.predict(example_case, start_iteration=i, num_iteration=1, pred_leaf=True)[0]
...
@@ -386,83 +410,86 @@ def test_example_case_in_tree_digraph():
    assert makes_categorical_splits


@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
@pytest.mark.parametrize("input_type", ["array", "dataframe"])
def test_empty_example_case_on_tree_digraph_raises_error(input_type):
    X, y = make_synthetic_regression()
    if input_type == "dataframe":
        if not PANDAS_INSTALLED:
            pytest.skip(reason="pandas is not installed")
        X = pd_DataFrame(X)
    ds = lgb.Dataset(X, y)
    bst = lgb.train({"num_leaves": 3}, ds, num_boost_round=1)
    example_case = X[:0]
    if input_type == "dataframe":
        example_case = pd_DataFrame(example_case)
    with pytest.raises(ValueError, match="example_case must have a single row."):
        lgb.create_tree_digraph(bst, tree_index=0, example_case=example_case)


@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_metrics(params, breast_cancer_split, train_data):
    X_train, X_test, y_train, y_test = breast_cancer_split
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)
    params.update({"metric": {"binary_logloss", "binary_error"}})

    evals_result0 = {}
    lgb.train(
        params,
        train_data,
        valid_sets=[train_data, test_data],
        valid_names=["v1", "v2"],
        num_boost_round=10,
        callbacks=[lgb.record_evaluation(evals_result0)],
    )
    with pytest.warns(UserWarning, match="More than one metric available, picking one to plot."):
        ax0 = lgb.plot_metric(evals_result0)
    assert isinstance(ax0, matplotlib.axes.Axes)
    assert ax0.get_title() == "Metric during training"
    assert ax0.get_xlabel() == "Iterations"
    assert ax0.get_ylabel() in {"binary_logloss", "binary_error"}
    legend_items = ax0.get_legend().get_texts()
    assert len(legend_items) == 2
    assert legend_items[0].get_text() == "v1"
    assert legend_items[1].get_text() == "v2"

    ax1 = lgb.plot_metric(evals_result0, metric="binary_error")
    assert isinstance(ax1, matplotlib.axes.Axes)
    assert ax1.get_title() == "Metric during training"
    assert ax1.get_xlabel() == "Iterations"
    assert ax1.get_ylabel() == "binary_error"
    legend_items = ax1.get_legend().get_texts()
    assert len(legend_items) == 2
    assert legend_items[0].get_text() == "v1"
    assert legend_items[1].get_text() == "v2"

    ax2 = lgb.plot_metric(evals_result0, metric="binary_logloss", dataset_names=["v2"])
    assert isinstance(ax2, matplotlib.axes.Axes)
    assert ax2.get_title() == "Metric during training"
    assert ax2.get_xlabel() == "Iterations"
    assert ax2.get_ylabel() == "binary_logloss"
    legend_items = ax2.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == "v2"

    ax3 = lgb.plot_metric(
        evals_result0,
        metric="binary_logloss",
        dataset_names=["v1"],
        title="Metric @metric@",
        xlabel="Iterations @metric@",
        ylabel='Value of "@metric@"',
        figsize=(5, 5),
        dpi=600,
        grid=False,
    )
    assert isinstance(ax3, matplotlib.axes.Axes)
    assert ax3.get_title() == "Metric @metric@"
    assert ax3.get_xlabel() == "Iterations @metric@"
    assert ax3.get_ylabel() == 'Value of "binary_logloss"'
    legend_items = ax3.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == "v1"
    assert ax3.get_figure().get_figheight() == 5
    assert ax3.get_figure().get_figwidth() == 5
    assert ax3.get_figure().get_dpi() == 600
...
@@ -472,9 +499,7 @@ def test_plot_metrics(params, breast_cancer_split, train_data):
        assert not grid_line.get_visible()

    evals_result1 = {}
    lgb.train(params, train_data, num_boost_round=10, callbacks=[lgb.record_evaluation(evals_result1)])
    with pytest.raises(ValueError, match="eval results cannot be empty."):
        lgb.plot_metric(evals_result1)
...
@@ -482,9 +507,9 @@ def test_plot_metrics(params, breast_cancer_split, train_data):
    gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    ax4 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
    assert isinstance(ax4, matplotlib.axes.Axes)
    assert ax4.get_title() == ""
    assert ax4.get_xlabel() == ""
    assert ax4.get_ylabel() == ""
    legend_items = ax4.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == "valid_0"
tests/python_package_test/test_sklearn.py  View file @ 1b792e71
...
@@ -23,32 +23,40 @@ from sklearn.utils.validation import check_is_fitted
...
@@ -23,32 +23,40 @@ from sklearn.utils.validation import check_is_fitted
import
lightgbm
as
lgb
import
lightgbm
as
lgb
from
lightgbm.compat
import
DATATABLE_INSTALLED
,
PANDAS_INSTALLED
,
dt_DataTable
,
pd_DataFrame
,
pd_Series
from
lightgbm.compat
import
DATATABLE_INSTALLED
,
PANDAS_INSTALLED
,
dt_DataTable
,
pd_DataFrame
,
pd_Series
from
.utils
import
(
load_breast_cancer
,
load_digits
,
load_iris
,
load_linnerud
,
make_ranking
,
make_synthetic_regression
,
from
.utils
import
(
sklearn_multiclass_custom_objective
,
softmax
)
load_breast_cancer
,
load_digits
,
load_iris
,
load_linnerud
,
make_ranking
,
make_synthetic_regression
,
sklearn_multiclass_custom_objective
,
softmax
,
)
decreasing_generator
=
itertools
.
count
(
0
,
-
1
)
decreasing_generator
=
itertools
.
count
(
0
,
-
1
)
task_to_model_factory
=
{
task_to_model_factory
=
{
'
ranking
'
:
lgb
.
LGBMRanker
,
"
ranking
"
:
lgb
.
LGBMRanker
,
'
binary-classification
'
:
lgb
.
LGBMClassifier
,
"
binary-classification
"
:
lgb
.
LGBMClassifier
,
'
multiclass-classification
'
:
lgb
.
LGBMClassifier
,
"
multiclass-classification
"
:
lgb
.
LGBMClassifier
,
'
regression
'
:
lgb
.
LGBMRegressor
,
"
regression
"
:
lgb
.
LGBMRegressor
,
}
}
def
_create_data
(
task
,
n_samples
=
100
,
n_features
=
4
):
def
_create_data
(
task
,
n_samples
=
100
,
n_features
=
4
):
if
task
==
'
ranking
'
:
if
task
==
"
ranking
"
:
X
,
y
,
g
=
make_ranking
(
n_features
=
4
,
n_samples
=
n_samples
)
X
,
y
,
g
=
make_ranking
(
n_features
=
4
,
n_samples
=
n_samples
)
g
=
np
.
bincount
(
g
)
g
=
np
.
bincount
(
g
)
elif
task
.
endswith
(
'
classification
'
):
elif
task
.
endswith
(
"
classification
"
):
if
task
==
'
binary-classification
'
:
if
task
==
"
binary-classification
"
:
centers
=
2
centers
=
2
elif
task
==
'
multiclass-classification
'
:
elif
task
==
"
multiclass-classification
"
:
centers
=
3
centers
=
3
else
:
else
:
ValueError
(
f
"Unknown classification task '
{
task
}
'"
)
ValueError
(
f
"Unknown classification task '
{
task
}
'"
)
X
,
y
=
make_blobs
(
n_samples
=
n_samples
,
n_features
=
n_features
,
centers
=
centers
,
random_state
=
42
)
X
,
y
=
make_blobs
(
n_samples
=
n_samples
,
n_features
=
n_features
,
centers
=
centers
,
random_state
=
42
)
g
=
None
g
=
None
elif
task
==
'
regression
'
:
elif
task
==
"
regression
"
:
X
,
y
=
make_synthetic_regression
(
n_samples
=
n_samples
,
n_features
=
n_features
)
X
,
y
=
make_synthetic_regression
(
n_samples
=
n_samples
,
n_features
=
n_features
)
g
=
None
g
=
None
return
X
,
y
,
g
return
X
,
y
,
g
...
@@ -70,7 +78,7 @@ def custom_asymmetric_obj(y_true, y_pred):
...
@@ -70,7 +78,7 @@ def custom_asymmetric_obj(y_true, y_pred):
def
objective_ls
(
y_true
,
y_pred
):
def
objective_ls
(
y_true
,
y_pred
):
grad
=
(
y_pred
-
y_true
)
grad
=
y_pred
-
y_true
hess
=
np
.
ones
(
len
(
y_true
))
hess
=
np
.
ones
(
len
(
y_true
))
return
grad
,
hess
return
grad
,
hess
...
@@ -87,15 +95,15 @@ def custom_dummy_obj(y_true, y_pred):
...
@@ -87,15 +95,15 @@ def custom_dummy_obj(y_true, y_pred):
def
constant_metric
(
y_true
,
y_pred
):
def
constant_metric
(
y_true
,
y_pred
):
return
'
error
'
,
0
,
False
return
"
error
"
,
0
,
False
def
decreasing_metric
(
y_true
,
y_pred
):
def
decreasing_metric
(
y_true
,
y_pred
):
return
(
'
decreasing_metric
'
,
next
(
decreasing_generator
),
False
)
return
(
"
decreasing_metric
"
,
next
(
decreasing_generator
),
False
)
def
mse
(
y_true
,
y_pred
):
def
mse
(
y_true
,
y_pred
):
return
'
custom MSE
'
,
mean_squared_error
(
y_true
,
y_pred
),
False
return
"
custom MSE
"
,
mean_squared_error
(
y_true
,
y_pred
),
False
def
binary_error
(
y_true
,
y_pred
):
def
binary_error
(
y_true
,
y_pred
):
...
@@ -117,7 +125,7 @@ def test_binary():
...
@@ -117,7 +125,7 @@ def test_binary():
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)],
callbacks
=
[
lgb
.
early_stopping
(
5
)])
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)],
callbacks
=
[
lgb
.
early_stopping
(
5
)])
ret
=
log_loss
(
y_test
,
gbm
.
predict_proba
(
X_test
))
ret
=
log_loss
(
y_test
,
gbm
.
predict_proba
(
X_test
))
assert
ret
<
0.12
assert
ret
<
0.12
assert
gbm
.
evals_result_
[
'
valid_0
'
][
'
binary_logloss
'
][
gbm
.
best_iteration_
-
1
]
==
pytest
.
approx
(
ret
)
assert
gbm
.
evals_result_
[
"
valid_0
"
][
"
binary_logloss
"
][
gbm
.
best_iteration_
-
1
]
==
pytest
.
approx
(
ret
)
def
test_regression
():
def
test_regression
():
...
@@ -127,10 +135,12 @@ def test_regression():
...
@@ -127,10 +135,12 @@ def test_regression():
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)],
callbacks
=
[
lgb
.
early_stopping
(
5
)])
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)],
callbacks
=
[
lgb
.
early_stopping
(
5
)])
ret
=
mean_squared_error
(
y_test
,
gbm
.
predict
(
X_test
))
ret
=
mean_squared_error
(
y_test
,
gbm
.
predict
(
X_test
))
assert
ret
<
174
assert
ret
<
174
assert
gbm
.
evals_result_
[
'
valid_0
'
][
'
l2
'
][
gbm
.
best_iteration_
-
1
]
==
pytest
.
approx
(
ret
)
assert
gbm
.
evals_result_
[
"
valid_0
"
][
"
l2
"
][
gbm
.
best_iteration_
-
1
]
==
pytest
.
approx
(
ret
)
@
pytest
.
mark
.
skipif
(
getenv
(
'TASK'
,
''
)
==
'cuda'
,
reason
=
'Skip due to differences in implementation details of CUDA version'
)
@
pytest
.
mark
.
skipif
(
getenv
(
"TASK"
,
""
)
==
"cuda"
,
reason
=
"Skip due to differences in implementation details of CUDA version"
)
def
test_multiclass
():
def
test_multiclass
():
X
,
y
=
load_digits
(
n_class
=
10
,
return_X_y
=
True
)
X
,
y
=
load_digits
(
n_class
=
10
,
return_X_y
=
True
)
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
42
)
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
42
)
...
@@ -140,16 +150,18 @@ def test_multiclass():
...
@@ -140,16 +150,18 @@ def test_multiclass():
assert
ret
<
0.05
assert
ret
<
0.05
ret
=
multi_logloss
(
y_test
,
gbm
.
predict_proba
(
X_test
))
ret
=
multi_logloss
(
y_test
,
gbm
.
predict_proba
(
X_test
))
assert
ret
<
0.16
assert
ret
<
0.16
assert
gbm
.
evals_result_
[
'
valid_0
'
][
'
multi_logloss
'
][
gbm
.
best_iteration_
-
1
]
==
pytest
.
approx
(
ret
)
assert
gbm
.
evals_result_
[
"
valid_0
"
][
"
multi_logloss
"
][
gbm
.
best_iteration_
-
1
]
==
pytest
.
approx
(
ret
)
@
pytest
.
mark
.
skipif
(
getenv
(
'TASK'
,
''
)
==
'cuda'
,
reason
=
'Skip due to differences in implementation details of CUDA version'
)
@
pytest
.
mark
.
skipif
(
getenv
(
"TASK"
,
""
)
==
"cuda"
,
reason
=
"Skip due to differences in implementation details of CUDA version"
)
def
test_lambdarank
():
def
test_lambdarank
():
rank_example_dir
=
Path
(
__file__
).
absolute
().
parents
[
2
]
/
'
examples
'
/
'
lambdarank
'
rank_example_dir
=
Path
(
__file__
).
absolute
().
parents
[
2
]
/
"
examples
"
/
"
lambdarank
"
X_train
,
y_train
=
load_svmlight_file
(
str
(
rank_example_dir
/
'
rank.train
'
))
X_train
,
y_train
=
load_svmlight_file
(
str
(
rank_example_dir
/
"
rank.train
"
))
X_test
,
y_test
=
load_svmlight_file
(
str
(
rank_example_dir
/
'
rank.test
'
))
X_test
,
y_test
=
load_svmlight_file
(
str
(
rank_example_dir
/
"
rank.test
"
))
q_train
=
np
.
loadtxt
(
str
(
rank_example_dir
/
'
rank.train.query
'
))
q_train
=
np
.
loadtxt
(
str
(
rank_example_dir
/
"
rank.train.query
"
))
q_test
=
np
.
loadtxt
(
str
(
rank_example_dir
/
'
rank.test.query
'
))
q_test
=
np
.
loadtxt
(
str
(
rank_example_dir
/
"
rank.test.query
"
))
gbm
=
lgb
.
LGBMRanker
(
n_estimators
=
50
)
gbm
=
lgb
.
LGBMRanker
(
n_estimators
=
50
)
gbm
.
fit
(
gbm
.
fit
(
X_train
,
X_train
,
...
@@ -158,23 +170,20 @@ def test_lambdarank():
...
@@ -158,23 +170,20 @@ def test_lambdarank():
eval_set
=
[(
X_test
,
y_test
)],
eval_set
=
[(
X_test
,
y_test
)],
eval_group
=
[
q_test
],
eval_group
=
[
q_test
],
eval_at
=
[
1
,
3
],
eval_at
=
[
1
,
3
],
callbacks
=
[
callbacks
=
[
lgb
.
early_stopping
(
10
),
lgb
.
reset_parameter
(
learning_rate
=
lambda
x
:
max
(
0.01
,
0.1
-
0.01
*
x
))],
lgb
.
early_stopping
(
10
),
lgb
.
reset_parameter
(
learning_rate
=
lambda
x
:
max
(
0.01
,
0.1
-
0.01
*
x
))
]
)
)
assert
gbm
.
best_iteration_
<=
24
assert
gbm
.
best_iteration_
<=
24
assert
gbm
.
best_score_
[
'
valid_0
'
][
'
ndcg@1
'
]
>
0.5674
assert
gbm
.
best_score_
[
"
valid_0
"
][
"
ndcg@1
"
]
>
0.5674
assert
gbm
.
best_score_
[
'
valid_0
'
][
'
ndcg@3
'
]
>
0.578
assert
gbm
.
best_score_
[
"
valid_0
"
][
"
ndcg@3
"
]
>
0.578
def
test_xendcg
():
def
test_xendcg
():
xendcg_example_dir
=
Path
(
__file__
).
absolute
().
parents
[
2
]
/
'
examples
'
/
'
xendcg
'
xendcg_example_dir
=
Path
(
__file__
).
absolute
().
parents
[
2
]
/
"
examples
"
/
"
xendcg
"
X_train
,
y_train
=
load_svmlight_file
(
str
(
xendcg_example_dir
/
'
rank.train
'
))
X_train
,
y_train
=
load_svmlight_file
(
str
(
xendcg_example_dir
/
"
rank.train
"
))
X_test
,
y_test
=
load_svmlight_file
(
str
(
xendcg_example_dir
/
'
rank.test
'
))
X_test
,
y_test
=
load_svmlight_file
(
str
(
xendcg_example_dir
/
"
rank.test
"
))
q_train
=
np
.
loadtxt
(
str
(
xendcg_example_dir
/
'
rank.train.query
'
))
q_train
=
np
.
loadtxt
(
str
(
xendcg_example_dir
/
"
rank.train.query
"
))
q_test
=
np
.
loadtxt
(
str
(
xendcg_example_dir
/
'
rank.test.query
'
))
q_test
=
np
.
loadtxt
(
str
(
xendcg_example_dir
/
"
rank.test.query
"
))
gbm
=
lgb
.
LGBMRanker
(
n_estimators
=
50
,
objective
=
'
rank_xendcg
'
,
random_state
=
5
,
n_jobs
=
1
)
gbm
=
lgb
.
LGBMRanker
(
n_estimators
=
50
,
objective
=
"
rank_xendcg
"
,
random_state
=
5
,
n_jobs
=
1
)
gbm
.
fit
(
gbm
.
fit
(
X_train
,
X_train
,
y_train
,
y_train
,
...
@@ -182,28 +191,25 @@ def test_xendcg():
...
@@ -182,28 +191,25 @@ def test_xendcg():
eval_set
=
[(
X_test
,
y_test
)],
eval_set
=
[(
X_test
,
y_test
)],
eval_group
=
[
q_test
],
eval_group
=
[
q_test
],
eval_at
=
[
1
,
3
],
eval_at
=
[
1
,
3
],
eval_metric
=
'ndcg'
,
eval_metric
=
"ndcg"
,
callbacks
=
[
callbacks
=
[
lgb
.
early_stopping
(
10
),
lgb
.
reset_parameter
(
learning_rate
=
lambda
x
:
max
(
0.01
,
0.1
-
0.01
*
x
))],
lgb
.
early_stopping
(
10
),
lgb
.
reset_parameter
(
learning_rate
=
lambda
x
:
max
(
0.01
,
0.1
-
0.01
*
x
))
]
)
)
assert
gbm
.
best_iteration_
<=
24
assert
gbm
.
best_iteration_
<=
24
assert
gbm
.
best_score_
[
'
valid_0
'
][
'
ndcg@1
'
]
>
0.6211
assert
gbm
.
best_score_
[
"
valid_0
"
][
"
ndcg@1
"
]
>
0.6211
assert
gbm
.
best_score_
[
'
valid_0
'
][
'
ndcg@3
'
]
>
0.6253
assert
gbm
.
best_score_
[
"
valid_0
"
][
"
ndcg@3
"
]
>
0.6253
def
test_eval_at_aliases
():
def
test_eval_at_aliases
():
rank_example_dir
=
Path
(
__file__
).
absolute
().
parents
[
2
]
/
'
examples
'
/
'
lambdarank
'
rank_example_dir
=
Path
(
__file__
).
absolute
().
parents
[
2
]
/
"
examples
"
/
"
lambdarank
"
X_train
,
y_train
=
load_svmlight_file
(
str
(
rank_example_dir
/
'
rank.train
'
))
X_train
,
y_train
=
load_svmlight_file
(
str
(
rank_example_dir
/
"
rank.train
"
))
X_test
,
y_test
=
load_svmlight_file
(
str
(
rank_example_dir
/
'
rank.test
'
))
X_test
,
y_test
=
load_svmlight_file
(
str
(
rank_example_dir
/
"
rank.test
"
))
q_train
=
np
.
loadtxt
(
str
(
rank_example_dir
/
'
rank.train.query
'
))
q_train
=
np
.
loadtxt
(
str
(
rank_example_dir
/
"
rank.train.query
"
))
q_test
=
np
.
loadtxt
(
str
(
rank_example_dir
/
'
rank.test.query
'
))
q_test
=
np
.
loadtxt
(
str
(
rank_example_dir
/
"
rank.test.query
"
))
for
alias
in
lgb
.
basic
.
_ConfigAliases
.
get
(
'
eval_at
'
):
for
alias
in
lgb
.
basic
.
_ConfigAliases
.
get
(
"
eval_at
"
):
gbm
=
lgb
.
LGBMRanker
(
n_estimators
=
5
,
**
{
alias
:
[
1
,
2
,
3
,
9
]})
gbm
=
lgb
.
LGBMRanker
(
n_estimators
=
5
,
**
{
alias
:
[
1
,
2
,
3
,
9
]})
with
pytest
.
warns
(
UserWarning
,
match
=
f
"Found '
{
alias
}
' in params. Will use it instead of 'eval_at' argument"
):
with
pytest
.
warns
(
UserWarning
,
match
=
f
"Found '
{
alias
}
' in params. Will use it instead of 'eval_at' argument"
):
gbm
.
fit
(
X_train
,
y_train
,
group
=
q_train
,
eval_set
=
[(
X_test
,
y_test
)],
eval_group
=
[
q_test
])
gbm
.
fit
(
X_train
,
y_train
,
group
=
q_train
,
eval_set
=
[(
X_test
,
y_test
)],
eval_group
=
[
q_test
])
assert
list
(
gbm
.
evals_result_
[
'
valid_0
'
].
keys
())
==
[
'
ndcg@1
'
,
'
ndcg@2
'
,
'
ndcg@3
'
,
'
ndcg@9
'
]
assert
list
(
gbm
.
evals_result_
[
"
valid_0
"
].
keys
())
==
[
"
ndcg@1
"
,
"
ndcg@2
"
,
"
ndcg@3
"
,
"
ndcg@9
"
]
@
pytest
.
mark
.
parametrize
(
"custom_objective"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"custom_objective"
,
[
True
,
False
])
...
@@ -212,20 +218,22 @@ def test_objective_aliases(custom_objective):
...
@@ -212,20 +218,22 @@ def test_objective_aliases(custom_objective):
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
42
)
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
42
)
if
custom_objective
:
if
custom_objective
:
obj
=
custom_dummy_obj
obj
=
custom_dummy_obj
metric_name
=
'
l2
'
# default one
metric_name
=
"
l2
"
# default one
else
:
else
:
obj
=
'
mape
'
obj
=
"
mape
"
metric_name
=
'
mape
'
metric_name
=
"
mape
"
evals
=
[]
evals
=
[]
for
alias
in
lgb
.
basic
.
_ConfigAliases
.
get
(
'
objective
'
):
for
alias
in
lgb
.
basic
.
_ConfigAliases
.
get
(
"
objective
"
):
gbm
=
lgb
.
LGBMRegressor
(
n_estimators
=
5
,
**
{
alias
:
obj
})
gbm
=
lgb
.
LGBMRegressor
(
n_estimators
=
5
,
**
{
alias
:
obj
})
if
alias
!=
'objective'
:
if
alias
!=
"objective"
:
with
pytest
.
warns
(
UserWarning
,
match
=
f
"Found '
{
alias
}
' in params. Will use it instead of 'objective' argument"
):
with
pytest
.
warns
(
UserWarning
,
match
=
f
"Found '
{
alias
}
' in params. Will use it instead of 'objective' argument"
):
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)])
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)])
else
:
else
:
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)])
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)])
assert
list
(
gbm
.
evals_result_
[
'
valid_0
'
].
keys
())
==
[
metric_name
]
assert
list
(
gbm
.
evals_result_
[
"
valid_0
"
].
keys
())
==
[
metric_name
]
evals
.
append
(
gbm
.
evals_result_
[
'
valid_0
'
][
metric_name
])
evals
.
append
(
gbm
.
evals_result_
[
"
valid_0
"
][
metric_name
])
evals_t
=
np
.
array
(
evals
).
T
evals_t
=
np
.
array
(
evals
).
T
for
i
in
range
(
evals_t
.
shape
[
0
]):
for
i
in
range
(
evals_t
.
shape
[
0
]):
np
.
testing
.
assert_allclose
(
evals_t
[
i
],
evals_t
[
i
][
0
])
np
.
testing
.
assert_allclose
(
evals_t
[
i
],
evals_t
[
i
][
0
])
...
@@ -241,7 +249,7 @@ def test_regression_with_custom_objective():
...
@@ -241,7 +249,7 @@ def test_regression_with_custom_objective():
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)],
callbacks
=
[
lgb
.
early_stopping
(
5
)])
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)],
callbacks
=
[
lgb
.
early_stopping
(
5
)])
ret
=
mean_squared_error
(
y_test
,
gbm
.
predict
(
X_test
))
ret
=
mean_squared_error
(
y_test
,
gbm
.
predict
(
X_test
))
assert
ret
<
174
assert
ret
<
174
assert
gbm
.
evals_result_
[
'
valid_0
'
][
'
l2
'
][
gbm
.
best_iteration_
-
1
]
==
pytest
.
approx
(
ret
)
assert
gbm
.
evals_result_
[
"
valid_0
"
][
"
l2
"
][
gbm
.
best_iteration_
-
1
]
==
pytest
.
approx
(
ret
)
def
test_binary_classification_with_custom_objective
():
def
test_binary_classification_with_custom_objective
():
...
@@ -260,7 +268,7 @@ def test_binary_classification_with_custom_objective():
...
@@ -260,7 +268,7 @@ def test_binary_classification_with_custom_objective():
def
test_dart
():
def
test_dart
():
X
,
y
=
make_synthetic_regression
()
X
,
y
=
make_synthetic_regression
()
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
42
)
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
42
)
gbm
=
lgb
.
LGBMRegressor
(
boosting_type
=
'
dart
'
,
n_estimators
=
50
)
gbm
=
lgb
.
LGBMRegressor
(
boosting_type
=
"
dart
"
,
n_estimators
=
50
)
gbm
.
fit
(
X_train
,
y_train
)
gbm
.
fit
(
X_train
,
y_train
)
score
=
gbm
.
score
(
X_test
,
y_test
)
score
=
gbm
.
score
(
X_test
,
y_test
)
assert
0.8
<=
score
<=
1.0
assert
0.8
<=
score
<=
1.0
...
@@ -269,22 +277,21 @@ def test_dart():
...
@@ -269,22 +277,21 @@ def test_dart():
def test_stacking_classifier():
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    classifiers = [("gbm1", lgb.LGBMClassifier(n_estimators=3)), ("gbm2", lgb.LGBMClassifier(n_estimators=3))]
    clf = StackingClassifier(
        estimators=classifiers,
        final_estimator=lgb.LGBMClassifier(n_estimators=3),
        passthrough=True,
    )
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    assert score >= 0.8
    assert score <= 1.0
    assert clf.n_features_in_ == 4  # number of input features
    assert len(clf.named_estimators_["gbm1"].feature_importances_) == 4
    assert clf.named_estimators_["gbm1"].n_features_in_ == clf.named_estimators_["gbm2"].n_features_in_
    assert clf.final_estimator_.n_features_in_ == 10  # number of concatenated features
    assert len(clf.final_estimator_.feature_importances_) == 10
    assert all(clf.named_estimators_["gbm1"].classes_ == clf.named_estimators_["gbm2"].classes_)
    assert all(clf.classes_ == clf.named_estimators_["gbm1"].classes_)


def test_stacking_regressor():
...
@@ -292,18 +299,15 @@ def test_stacking_regressor():
    n_features = X.shape[1]
    n_input_models = 2
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    regressors = [("gbm1", lgb.LGBMRegressor(n_estimators=3)), ("gbm2", lgb.LGBMRegressor(n_estimators=3))]
    reg = StackingRegressor(estimators=regressors, final_estimator=lgb.LGBMRegressor(n_estimators=3), passthrough=True)
    reg.fit(X_train, y_train)
    score = reg.score(X_test, y_test)
    assert score >= 0.2
    assert score <= 1.0
    assert reg.n_features_in_ == n_features  # number of input features
    assert len(reg.named_estimators_["gbm1"].feature_importances_) == n_features
    assert reg.named_estimators_["gbm1"].n_features_in_ == reg.named_estimators_["gbm2"].n_features_in_
    assert reg.final_estimator_.n_features_in_ == n_features + n_input_models  # number of concatenated features
    assert len(reg.final_estimator_.feature_importances_) == n_features + n_input_models
...
@@ -313,91 +317,69 @@ def test_grid_search():
    y = y.astype(str)  # utilize label encoder at it's max power
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
    params = {"subsample": 0.8, "subsample_freq": 1}
    grid_params = {"boosting_type": ["rf", "gbdt"], "n_estimators": [4, 6], "reg_alpha": [0.01, 0.005]}
    evals_result = {}
    fit_params = {
        "eval_set": [(X_val, y_val)],
        "eval_metric": constant_metric,
        "callbacks": [lgb.early_stopping(2), lgb.record_evaluation(evals_result)],
    }
    grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2)
    grid.fit(X_train, y_train, **fit_params)
    score = grid.score(X_test, y_test)  # utilizes GridSearchCV default refit=True
    assert grid.best_params_["boosting_type"] in ["rf", "gbdt"]
    assert grid.best_params_["n_estimators"] in [4, 6]
    assert grid.best_params_["reg_alpha"] in [0.01, 0.005]
    assert grid.best_score_ <= 1.0
    assert grid.best_estimator_.best_iteration_ == 1
    assert grid.best_estimator_.best_score_["valid_0"]["multi_logloss"] < 0.25
    assert grid.best_estimator_.best_score_["valid_0"]["error"] == 0
    assert score >= 0.2
    assert score <= 1.0
    assert evals_result == grid.best_estimator_.evals_result_


def test_random_search():
    X, y = load_iris(return_X_y=True)
    y = y.astype(str)  # utilize label encoder at it's max power
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
    n_iter = 3  # Number of samples
    params = {"subsample": 0.8, "subsample_freq": 1}
    param_dist = {
        "boosting_type": ["rf", "gbdt"],
        "n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)],
        "reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)],
    }
    fit_params = {"eval_set": [(X_val, y_val)], "eval_metric": constant_metric, "callbacks": [lgb.early_stopping(2)]}
    rand = RandomizedSearchCV(
        estimator=lgb.LGBMClassifier(**params),
        param_distributions=param_dist,
        cv=2,
        n_iter=n_iter,
        random_state=42,
    )
    rand.fit(X_train, y_train, **fit_params)
    score = rand.score(X_test, y_test)  # utilizes RandomizedSearchCV default refit=True
    assert rand.best_params_["boosting_type"] in ["rf", "gbdt"]
    assert rand.best_params_["n_estimators"] in list(range(3, 10))
    assert rand.best_params_["reg_alpha"] >= 0.01  # Left-closed boundary point
    assert rand.best_params_["reg_alpha"] <= 0.06  # Right-closed boundary point
    assert rand.best_score_ <= 1.0
    assert rand.best_estimator_.best_score_["valid_0"]["multi_logloss"] < 0.25
    assert rand.best_estimator_.best_score_["valid_0"]["error"] == 0
    assert score >= 0.2
    assert score <= 1.0


def test_multioutput_classifier():
    n_outputs = 3
    X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=n_outputs, random_state=0)
    y = y.astype(str)  # utilize label encoder at it's max power
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    clf = MultiOutputClassifier(estimator=lgb.LGBMClassifier(n_estimators=10))
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    assert score >= 0.2
    assert score <= 1.0
    np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_))
    for classifier in clf.estimators_:
        assert isinstance(classifier, lgb.LGBMClassifier)
        assert isinstance(classifier.booster_, lgb.Booster)
...
@@ -405,15 +387,14 @@ def test_multioutput_classifier():
def test_multioutput_regressor():
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch["data"], bunch["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    reg = MultiOutputRegressor(estimator=lgb.LGBMRegressor(n_estimators=10))
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    _, score, _ = mse(y_test, y_pred)
    assert score >= 0.2
    assert score <= 120.0
    for regressor in reg.estimators_:
        assert isinstance(regressor, lgb.LGBMRegressor)
        assert isinstance(regressor.booster_, lgb.Booster)
...
@@ -421,19 +402,15 @@ def test_multioutput_regressor():
def test_classifier_chain():
    n_outputs = 3
    X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=n_outputs, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    order = [2, 0, 1]
    clf = ClassifierChain(base_estimator=lgb.LGBMClassifier(n_estimators=10), order=order, random_state=42)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    assert score >= 0.2
    assert score <= 1.0
    np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_))
    assert order == clf.order_
    for classifier in clf.estimators_:
        assert isinstance(classifier, lgb.LGBMClassifier)
...
@@ -442,16 +419,15 @@ def test_classifier_chain():
def test_regressor_chain():
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch["data"], bunch["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    order = [2, 0, 1]
    reg = RegressorChain(base_estimator=lgb.LGBMRegressor(n_estimators=10), order=order, random_state=42)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    _, score, _ = mse(y_test, y_pred)
    assert score >= 0.2
    assert score <= 120.0
    assert order == reg.order_
    for regressor in reg.estimators_:
        assert isinstance(regressor, lgb.LGBMRegressor)
...
@@ -489,24 +465,17 @@ def test_clone_and_property():
def test_joblib():
    X, y = make_synthetic_regression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj, verbose=-1, importance_type="split")
    gbm.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric=mse,
        callbacks=[lgb.early_stopping(5), lgb.reset_parameter(learning_rate=list(np.arange(1, 0, -0.1)))],
    )

    joblib.dump(gbm, "lgb.pkl")  # test model with custom functions
    gbm_pickle = joblib.load("lgb.pkl")
    assert isinstance(gbm_pickle.booster_, lgb.Booster)
    assert gbm.get_params() == gbm_pickle.get_params()
    np.testing.assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_)
...
@@ -515,8 +484,7 @@ def test_joblib():
    for eval_set in gbm.evals_result_:
        for metric in gbm.evals_result_[eval_set]:
            np.testing.assert_allclose(gbm.evals_result_[eval_set][metric], gbm_pickle.evals_result_[eval_set][metric])
    pred_origin = gbm.predict(X_test)
    pred_pickle = gbm_pickle.predict(X_test)
    np.testing.assert_allclose(pred_origin, pred_pickle)
...
@@ -526,7 +494,7 @@ def test_non_serializable_objects_in_callbacks(tmp_path):
    unpicklable_callback = UnpicklableCallback()
    with pytest.raises(Exception, match="This class in not picklable"):
        joblib.dump(unpicklable_callback, tmp_path / "tmp.joblib")
    X, y = make_synthetic_regression()
    gbm = lgb.LGBMRegressor(n_estimators=5)
...
@@ -578,9 +546,9 @@ def test_feature_importances_type():
    data = load_iris(return_X_y=False)
    clf = lgb.LGBMClassifier(n_estimators=10)
    clf.fit(data.data, data.target)
    clf.set_params(importance_type="split")
    importances_split = clf.feature_importances_
    clf.set_params(importance_type="gain")
    importances_gain = clf.feature_importances_
    # Test that the largest element is NOT the same, the smallest can be the same, i.e. zero
    importance_split_top1 = sorted(importances_split, reverse=True)[0]
...
@@ -591,38 +559,44 @@ def test_pandas_categorical():
def test_pandas_categorical():
    pd = pytest.importorskip("pandas")
    np.random.seed(42)  # sometimes there is no difference how cols are treated (cat or not cat)
    X = pd.DataFrame(
        {
            "A": np.random.permutation(["a", "b", "c", "d"] * 75),  # str
            "B": np.random.permutation([1, 2, 3] * 100),  # int
            "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
            "D": np.random.permutation([True, False] * 150),  # bool
            "E": pd.Categorical(np.random.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True),
        }
    )  # str and ordered categorical
    y = np.random.permutation([0, 1] * 150)
    X_test = pd.DataFrame(
        {
            "A": np.random.permutation(["a", "b", "e"] * 20),  # unseen category
            "B": np.random.permutation([1, 3] * 30),
            "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
            "D": np.random.permutation([True, False] * 30),
            "E": pd.Categorical(np.random.permutation(["z", "y"] * 30), ordered=True),
        }
    )
    np.random.seed()  # reset seed
    cat_cols_actual = ["A", "B", "C", "D"]
    cat_cols_to_store = cat_cols_actual + ["E"]
    X[cat_cols_actual] = X[cat_cols_actual].astype("category")
    X_test[cat_cols_actual] = X_test[cat_cols_actual].astype("category")
    cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
    gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
    pred0 = gbm0.predict(X_test, raw_score=True)
    pred_prob = gbm0.predict_proba(X_test)[:, 1]
    gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, pd.Series(y), categorical_feature=[0])
    pred1 = gbm1.predict(X_test, raw_score=True)
    gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A"])
    pred2 = gbm2.predict(X_test, raw_score=True)
    gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A", "B", "C", "D"])
    pred3 = gbm3.predict(X_test, raw_score=True)
    gbm3.booster_.save_model("categorical.model")
    gbm4 = lgb.Booster(model_file="categorical.model")
    pred4 = gbm4.predict(X_test)
    gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A", "B", "C", "D", "E"])
    pred5 = gbm5.predict(X_test, raw_score=True)
    gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=[])
    pred6 = gbm6.predict(X_test, raw_score=True)
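Aside (not part of the commit diff): the test above exercises LightGBM's handling of pandas "category" dtype columns in the sklearn wrapper; columns with that dtype are treated as categorical features without being listed in categorical_feature. A minimal, self-contained sketch of that behaviour, with made-up data and names, would be:

import numpy as np
import pandas as pd
import lightgbm as lgb

rng = np.random.default_rng(42)
n = 300
X = pd.DataFrame(
    {
        "A": pd.Series(rng.choice(["a", "b", "c", "d"], size=n)).astype("category"),
        "B": rng.normal(size=n),
    }
)
# Make the target depend on the categorical column so the split is learnable.
y = X["A"].isin(["a", "b"]).astype(int)

clf = lgb.LGBMClassifier(n_estimators=10).fit(X, y)  # "A" is picked up as categorical automatically
proba = clf.predict_proba(X)
assert proba.shape == (n, 2)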
...
@@ -648,18 +622,26 @@ def test_pandas_categorical():
def test_pandas_sparse():
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame(
        {
            "A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)),
            "B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
            "C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150)),
        }
    )
    y = pd.Series(pd.arrays.SparseArray(np.random.permutation([0, 1] * 150)))
    X_test = pd.DataFrame(
        {
            "A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)),
            "B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
            "C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30)),
        }
    )
    for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
        assert pd.api.types.is_sparse(dtype)
    gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
    pred_sparse = gbm.predict(X_test, raw_score=True)
    if hasattr(X_test, "sparse"):
        pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
    else:
        pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
...
@@ -669,13 +651,9 @@ def test_pandas_sparse():
def test_predict():
    # With default params
    iris = load_iris(return_X_y=False)
    X_train, X_test, y_train, _ = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
    gbm = lgb.train({"objective": "multiclass", "num_class": 3, "verbose": -1}, lgb.Dataset(X_train, y_train))
    clf = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train)

    # Tests same probabilities
...
@@ -705,9 +683,7 @@ def test_predict():
    # Tests other parameters for the prediction works
    res_engine = gbm.predict(X_test)
    res_sklearn_params = clf.predict_proba(X_test, pred_early_stop=True, pred_early_stop_margin=1.0)
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(res_engine, res_sklearn_params)
...
@@ -739,9 +715,7 @@ def test_predict():
    # Tests other parameters for the prediction works, starting from iteration 10
    res_engine = gbm.predict(X_test, start_iteration=10)
    res_sklearn_params = clf.predict_proba(X_test, pred_early_stop=True, pred_early_stop_margin=1.0, start_iteration=10)
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(res_engine, res_sklearn_params)
...
@@ -750,34 +724,43 @@ def test_predict_with_params_from_init():
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)
    predict_params = {"pred_early_stop": True, "pred_early_stop_margin": 1.0}
    y_preds_no_params = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict(X_test, raw_score=True)
    y_preds_params_in_predict = (
        lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict(X_test, raw_score=True, **predict_params)
    )
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(y_preds_no_params, y_preds_params_in_predict)

    y_preds_params_in_set_params_before_fit = (
        lgb.LGBMClassifier(verbose=-1)
        .set_params(**predict_params)
        .fit(X_train, y_train)
        .predict(X_test, raw_score=True)
    )
    np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_set_params_before_fit)

    y_preds_params_in_set_params_after_fit = (
        lgb.LGBMClassifier(verbose=-1)
        .fit(X_train, y_train)
        .set_params(**predict_params)
        .predict(X_test, raw_score=True)
    )
    np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_set_params_after_fit)

    y_preds_params_in_init = (
        lgb.LGBMClassifier(verbose=-1, **predict_params).fit(X_train, y_train).predict(X_test, raw_score=True)
    )
    np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_init)

    # test that params passed in predict have higher priority
    y_preds_params_overwritten = (
        lgb.LGBMClassifier(verbose=-1, **predict_params)
        .fit(X_train, y_train)
        .predict(X_test, raw_score=True, pred_early_stop=False)
    )
    np.testing.assert_allclose(y_preds_no_params, y_preds_params_overwritten)
...
@@ -787,315 +770,307 @@ def test_evaluate_train_set():
    gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
    gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
    assert len(gbm.evals_result_) == 2
    assert "training" in gbm.evals_result_
    assert len(gbm.evals_result_["training"]) == 1
    assert "l2" in gbm.evals_result_["training"]
    assert "valid_1" in gbm.evals_result_
    assert len(gbm.evals_result_["valid_1"]) == 1
    assert "l2" in gbm.evals_result_["valid_1"]


def test_metrics():
    X, y = make_synthetic_regression()
    y = abs(y)
    params = {"n_estimators": 2, "verbose": -1}
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}
    # no custom objective, no custom metric
    # default metric
    gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "l2" in gbm.evals_result_["training"]
    # non-default metric
    gbm = lgb.LGBMRegressor(metric="mape", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "mape" in gbm.evals_result_["training"]
    # no metric
    gbm = lgb.LGBMRegressor(metric="None", **params).fit(**params_fit)
    assert gbm.evals_result_ == {}
    # non-default metric in eval_metric
    gbm = lgb.LGBMRegressor(**params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]
    # non-default metric with non-default metric in eval_metric
    gbm = lgb.LGBMRegressor(metric="gamma", **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "gamma" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]
    # non-default metric with multiple metrics in eval_metric
    gbm = lgb.LGBMRegressor(metric="gamma", **params).fit(eval_metric=["l2", "mape"], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]
    # non-default metric with multiple metrics in eval_metric for LGBMClassifier
    X_classification, y_classification = load_breast_cancer(return_X_y=True)
    params_classification = {"n_estimators": 2, "verbose": -1, "objective": "binary", "metric": "binary_logloss"}
    params_fit_classification = {
        "X": X_classification,
        "y": y_classification,
        "eval_set": (X_classification, y_classification),
    }
    gbm = lgb.LGBMClassifier(**params_classification).fit(eval_metric=["fair", "error"], **params_fit_classification)
    assert len(gbm.evals_result_["training"]) == 3
    assert "fair" in gbm.evals_result_["training"]
    assert "binary_error" in gbm.evals_result_["training"]
    assert "binary_logloss" in gbm.evals_result_["training"]
    # default metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "l1" in gbm.evals_result_["training"]
    # non-default metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="mape", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "mape" in gbm.evals_result_["training"]
    # no metric
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="None", **params).fit(**params_fit)
    assert gbm.evals_result_ == {}
    # non-default metric in eval_metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l1" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]
    # non-default metric with non-default metric in eval_metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="gamma", **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "gamma" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]
    # non-default metric with multiple metrics in eval_metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="gamma", **params).fit(
        eval_metric=["l2", "mape"], **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 3
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]
    # custom objective, no custom metric
    # default regression metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "l2" in gbm.evals_result_["training"]
    # non-default regression metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "mape" in gbm.evals_result_["training"]
    # multiple regression metrics for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    # no metric
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="None", **params).fit(**params_fit)
    assert gbm.evals_result_ == {}
    # default regression metric with non-default metric in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]
    # non-default regression metric with metric in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(eval_metric="gamma", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "mape" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    # multiple regression metrics with metric in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(
        eval_metric="l2", **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 3
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]
    # multiple regression metrics with multiple metrics in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(
        eval_metric=["l2", "mape"], **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 4
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]
# no custom objective, custom metric
# no custom objective, custom metric
# default metric with custom metric
# default metric with custom metric
gbm
=
lgb
.
LGBMRegressor
(
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
gbm
=
lgb
.
LGBMRegressor
(
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
assert
len
(
gbm
.
evals_result_
[
'
training
'
])
==
2
assert
len
(
gbm
.
evals_result_
[
"
training
"
])
==
2
assert
'
l2
'
in
gbm
.
evals_result_
[
'
training
'
]
assert
"
l2
"
in
gbm
.
evals_result_
[
"
training
"
]
assert
'
error
'
in
gbm
.
evals_result_
[
'
training
'
]
assert
"
error
"
in
gbm
.
evals_result_
[
"
training
"
]
# non-default metric with custom metric
# non-default metric with custom metric
gbm
=
lgb
.
LGBMRegressor
(
metric
=
'mape'
,
gbm
=
lgb
.
LGBMRegressor
(
metric
=
"mape"
,
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
assert
len
(
gbm
.
evals_result_
[
"training"
])
==
2
assert
len
(
gbm
.
evals_result_
[
'training'
])
==
2
assert
"mape"
in
gbm
.
evals_result_
[
"training"
]
assert
'mape'
in
gbm
.
evals_result_
[
'training'
]
assert
"error"
in
gbm
.
evals_result_
[
"training"
]
assert
'error'
in
gbm
.
evals_result_
[
'training'
]
# multiple metrics with custom metric
# multiple metrics with custom metric
gbm
=
lgb
.
LGBMRegressor
(
metric
=
[
'l1'
,
'gamma'
],
gbm
=
lgb
.
LGBMRegressor
(
metric
=
[
"l1"
,
"gamma"
],
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
assert
len
(
gbm
.
evals_result_
[
"training"
])
==
3
assert
len
(
gbm
.
evals_result_
[
'training'
])
==
3
assert
"l1"
in
gbm
.
evals_result_
[
"training"
]
assert
'l1'
in
gbm
.
evals_result_
[
'training'
]
assert
"gamma"
in
gbm
.
evals_result_
[
"training"
]
assert
'gamma'
in
gbm
.
evals_result_
[
'training'
]
assert
"error"
in
gbm
.
evals_result_
[
"training"
]
assert
'error'
in
gbm
.
evals_result_
[
'training'
]
# custom metric (disable default metric)
# custom metric (disable default metric)
gbm
=
lgb
.
LGBMRegressor
(
metric
=
'None'
,
gbm
=
lgb
.
LGBMRegressor
(
metric
=
"None"
,
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
assert
len
(
gbm
.
evals_result_
[
"training"
])
==
1
assert
len
(
gbm
.
evals_result_
[
'training'
])
==
1
assert
"error"
in
gbm
.
evals_result_
[
"training"
]
assert
'error'
in
gbm
.
evals_result_
[
'training'
]
# default metric for non-default objective with custom metric
# default metric for non-default objective with custom metric
gbm
=
lgb
.
LGBMRegressor
(
objective
=
'regression_l1'
,
gbm
=
lgb
.
LGBMRegressor
(
objective
=
"regression_l1"
,
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
assert
len
(
gbm
.
evals_result_
[
"training"
])
==
2
assert
len
(
gbm
.
evals_result_
[
'training'
])
==
2
assert
"l1"
in
gbm
.
evals_result_
[
"training"
]
assert
'l1'
in
gbm
.
evals_result_
[
'training'
]
assert
"error"
in
gbm
.
evals_result_
[
"training"
]
assert
'error'
in
gbm
.
evals_result_
[
'training'
]
# non-default metric for non-default objective with custom metric
# non-default metric for non-default objective with custom metric
gbm
=
lgb
.
LGBMRegressor
(
objective
=
'regression_l1'
,
metric
=
'mape'
,
gbm
=
lgb
.
LGBMRegressor
(
objective
=
"regression_l1"
,
metric
=
"mape"
,
**
params
).
fit
(
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
eval_metric
=
constant_metric
,
**
params_fit
assert
len
(
gbm
.
evals_result_
[
'training'
])
==
2
)
assert
'mape'
in
gbm
.
evals_result_
[
'training'
]
assert
len
(
gbm
.
evals_result_
[
"training"
])
==
2
assert
'error'
in
gbm
.
evals_result_
[
'training'
]
assert
"mape"
in
gbm
.
evals_result_
[
"training"
]
assert
"error"
in
gbm
.
evals_result_
[
"training"
]
# multiple metrics for non-default objective with custom metric
# multiple metrics for non-default objective with custom metric
gbm
=
lgb
.
LGBMRegressor
(
objective
=
'regression_l1'
,
metric
=
[
'l1'
,
'gamma'
],
gbm
=
lgb
.
LGBMRegressor
(
objective
=
"regression_l1"
,
metric
=
[
"l1"
,
"gamma"
],
**
params
).
fit
(
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
eval_metric
=
constant_metric
,
**
params_fit
assert
len
(
gbm
.
evals_result_
[
'training'
])
==
3
)
assert
'l1'
in
gbm
.
evals_result_
[
'training'
]
assert
len
(
gbm
.
evals_result_
[
"training"
])
==
3
assert
'gamma'
in
gbm
.
evals_result_
[
'training'
]
assert
"l1"
in
gbm
.
evals_result_
[
"training"
]
assert
'error'
in
gbm
.
evals_result_
[
'training'
]
assert
"gamma"
in
gbm
.
evals_result_
[
"training"
]
assert
"error"
in
gbm
.
evals_result_
[
"training"
]
# custom metric (disable default metric for non-default objective)
# custom metric (disable default metric for non-default objective)
gbm
=
lgb
.
LGBMRegressor
(
objective
=
'regression_l1'
,
metric
=
'None'
,
gbm
=
lgb
.
LGBMRegressor
(
objective
=
"regression_l1"
,
metric
=
"None"
,
**
params
).
fit
(
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
eval_metric
=
constant_metric
,
**
params_fit
assert
len
(
gbm
.
evals_result_
[
'training'
])
==
1
)
assert
'error'
in
gbm
.
evals_result_
[
'training'
]
assert
len
(
gbm
.
evals_result_
[
"training"
])
==
1
assert
"error"
in
gbm
.
evals_result_
[
"training"
]
# custom objective, custom metric
# custom objective, custom metric
# custom metric for custom objective
# custom metric for custom objective
gbm
=
lgb
.
LGBMRegressor
(
objective
=
custom_dummy_obj
,
gbm
=
lgb
.
LGBMRegressor
(
objective
=
custom_dummy_obj
,
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
assert
len
(
gbm
.
evals_result_
[
"training"
])
==
2
assert
len
(
gbm
.
evals_result_
[
'training'
])
==
2
assert
"error"
in
gbm
.
evals_result_
[
"training"
]
assert
'error'
in
gbm
.
evals_result_
[
'training'
]
# non-default regression metric with custom metric for custom objective
# non-default regression metric with custom metric for custom objective
gbm
=
lgb
.
LGBMRegressor
(
objective
=
custom_dummy_obj
,
metric
=
'mape'
,
gbm
=
lgb
.
LGBMRegressor
(
objective
=
custom_dummy_obj
,
metric
=
"mape"
,
**
params
).
fit
(
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
eval_metric
=
constant_metric
,
**
params_fit
assert
len
(
gbm
.
evals_result_
[
'training'
])
==
2
)
assert
'mape'
in
gbm
.
evals_result_
[
'training'
]
assert
len
(
gbm
.
evals_result_
[
"training"
])
==
2
assert
'error'
in
gbm
.
evals_result_
[
'training'
]
assert
"mape"
in
gbm
.
evals_result_
[
"training"
]
assert
"error"
in
gbm
.
evals_result_
[
"training"
]
# multiple regression metrics with custom metric for custom objective
# multiple regression metrics with custom metric for custom objective
gbm
=
lgb
.
LGBMRegressor
(
objective
=
custom_dummy_obj
,
metric
=
[
'l2'
,
'mape'
],
gbm
=
lgb
.
LGBMRegressor
(
objective
=
custom_dummy_obj
,
metric
=
[
"l2"
,
"mape"
],
**
params
).
fit
(
**
params
).
fit
(
eval_metric
=
constant_metric
,
**
params_fit
)
eval_metric
=
constant_metric
,
**
params_fit
assert
len
(
gbm
.
evals_result_
[
'training'
])
==
3
)
assert
'l2'
in
gbm
.
evals_result_
[
'training'
]
assert
len
(
gbm
.
evals_result_
[
"training"
])
==
3
assert
'mape'
in
gbm
.
evals_result_
[
'training'
]
assert
"l2"
in
gbm
.
evals_result_
[
"training"
]
assert
'error'
in
gbm
.
evals_result_
[
'training'
]
assert
"mape"
in
gbm
.
evals_result_
[
"training"
]
assert
"error"
in
gbm
.
evals_result_
[
"training"
]
X
,
y
=
load_digits
(
n_class
=
3
,
return_X_y
=
True
)
X
,
y
=
load_digits
(
n_class
=
3
,
return_X_y
=
True
)
params_fit
=
{
'X'
:
X
,
'y'
:
y
,
'
eval_set
'
:
(
X
,
y
)}
params_fit
=
{
"X"
:
X
,
"y"
:
y
,
"
eval_set
"
:
(
X
,
y
)}
    # default metric and invalid binary metric is replaced with multiclass alternative
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric="binary_error", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "multi_logloss" in gbm.evals_result_["training"]
    assert "multi_error" in gbm.evals_result_["training"]
    # invalid binary metric is replaced with multiclass alternative
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric="binary_error", **params_fit)
    assert gbm.objective_ == "multiclass"
    assert len(gbm.evals_result_["training"]) == 2
    assert "multi_logloss" in gbm.evals_result_["training"]
    assert "multi_error" in gbm.evals_result_["training"]
    # default metric for non-default multiclass objective
    # and invalid binary metric is replaced with multiclass alternative
    gbm = lgb.LGBMClassifier(objective="ovr", **params).fit(eval_metric="binary_error", **params_fit)
    assert gbm.objective_ == "ovr"
    assert len(gbm.evals_result_["training"]) == 2
    assert "multi_logloss" in gbm.evals_result_["training"]
    assert "multi_error" in gbm.evals_result_["training"]
    X, y = load_digits(n_class=2, return_X_y=True)
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}
    # default metric and invalid multiclass metric is replaced with binary alternative
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric="multi_error", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "binary_logloss" in gbm.evals_result_["training"]
    assert "binary_error" in gbm.evals_result_["training"]
    # invalid multiclass metric is replaced with binary alternative for custom objective
    gbm = lgb.LGBMClassifier(objective=custom_dummy_obj, **params).fit(eval_metric="multi_logloss", **params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "binary_logloss" in gbm.evals_result_["training"]
def test_multiple_eval_metrics():
    X, y = load_breast_cancer(return_X_y=True)

    params = {"n_estimators": 2, "verbose": -1, "objective": "binary", "metric": "binary_logloss"}
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}
    # Verify that can receive a list of metrics, only callable
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "error" in gbm.evals_result_["training"]
    assert "decreasing_metric" in gbm.evals_result_["training"]
    assert "binary_logloss" in gbm.evals_result_["training"]
    # Verify that can receive a list of custom and built-in metrics
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric, "fair"], **params_fit)
    assert len(gbm.evals_result_["training"]) == 4
    assert "error" in gbm.evals_result_["training"]
    assert "decreasing_metric" in gbm.evals_result_["training"]
    assert "binary_logloss" in gbm.evals_result_["training"]
    assert "fair" in gbm.evals_result_["training"]
    # Verify that works as expected when eval_metric is empty
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[], **params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "binary_logloss" in gbm.evals_result_["training"]
    # Verify that can receive a list of metrics, only built-in
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=["fair", "error"], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "binary_logloss" in gbm.evals_result_["training"]
    # Verify that eval_metric is robust to receiving a list with None
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=["fair", "error", None], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "binary_logloss" in gbm.evals_result_["training"]
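Note: the callable metrics exercised above (constant_metric, decreasing_metric) are helpers defined elsewhere in this test suite and are not reproduced in this diff. As a rough, hedged sketch of the shape such a callable takes in the scikit-learn interface, an eval_metric function returns a (name, value, is_higher_better) tuple; the metric name and data below are illustrative only, not part of this commit.

import numpy as np
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer

def mean_abs_error(y_true, y_pred):
    # scikit-learn-style custom metric: returns (eval_name, eval_result, is_higher_better)
    return "mean_abs_error", float(np.mean(np.abs(y_true - y_pred))), False

X, y = load_breast_cancer(return_X_y=True)
clf = lgb.LGBMClassifier(n_estimators=5, verbose=-1)
# mixing a callable with a built-in metric name, as the tests above do
clf.fit(X, y, eval_set=[(X, y)], eval_metric=[mean_abs_error, "binary_error"])
print(sorted(clf.evals_result_["training"].keys()))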
def test_nan_handle():
    ...
@@ -1104,18 +1079,18 @@ def test_nan_handle():
    X = np.random.randn(nrows, ncols)
    y = np.random.randn(nrows) + np.full(nrows, 1e30)
    weight = np.zeros(nrows)
    params = {"n_estimators": 20, "verbose": -1}
    params_fit = {"X": X, "y": y, "sample_weight": weight, "eval_set": (X, y), "callbacks": [lgb.early_stopping(5)]}
    gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
    np.testing.assert_allclose(gbm.evals_result_["training"]["l2"], np.nan)
@pytest.mark.skipif(getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version")
def test_first_metric_only():
    def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only):
        params["first_metric_only"] = first_metric_only
        gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
        assert len(gbm.evals_result_) == len(eval_set_names)
        for eval_set_name in eval_set_names:
            ...
@@ -1125,11 +1100,13 @@ def test_first_metric_only():
            assert metric_name in gbm.evals_result_[eval_set_name]
            actual = len(gbm.evals_result_[eval_set_name][metric_name])
            expected = assumed_iteration + (
                params["early_stopping_rounds"] if eval_set_name != "training" and assumed_iteration != gbm.n_estimators else 0
            )
            assert expected == actual
            if eval_set_name != "training":
                assert assumed_iteration == gbm.best_iteration_
            else:
                assert gbm.n_estimators == gbm.best_iteration_
    ...
@@ -1137,14 +1114,15 @@ def test_first_metric_only():
    X, y = make_synthetic_regression(n_samples=300)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=72)
    params = {
        "n_estimators": 30,
        "learning_rate": 0.8,
        "num_leaves": 15,
        "verbose": -1,
        "seed": 123,
        "early_stopping_rounds": 5,
    }  # early stop should be supported via global LightGBM parameter
    params_fit = {"X": X_train, "y": y_train}

    iter_valid1_l1 = 4
    iter_valid1_l2 = 4
    ...
@@ -1157,100 +1135,116 @@ def test_first_metric_only():
    iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])

    # feval
    params["metric"] = "None"
    params_fit["eval_metric"] = lambda preds, train_data: [
        decreasing_metric(preds, train_data),
        constant_metric(preds, train_data),
    ]
    params_fit["eval_set"] = (X_test1, y_test1)
    fit_and_check(["valid_0"], ["decreasing_metric", "error"], 1, False)
    fit_and_check(["valid_0"], ["decreasing_metric", "error"], 30, True)
    params_fit["eval_metric"] = lambda preds, train_data: [
        constant_metric(preds, train_data),
        decreasing_metric(preds, train_data),
    ]
    fit_and_check(["valid_0"], ["decreasing_metric", "error"], 1, True)
    # single eval_set
    params.pop("metric")
    params_fit.pop("eval_metric")
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)

    params_fit["eval_metric"] = "l2"
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)

    params_fit["eval_metric"] = "l1"
    fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
    fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l1, True)

    params_fit["eval_metric"] = ["l1", "l2"]
    fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
    fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l1, True)

    params_fit["eval_metric"] = ["l2", "l1"]
    fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
    fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l2, True)

    params_fit["eval_metric"] = ["l2", "regression", "mse"]  # test aliases
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)
    # two eval_set
    params_fit["eval_set"] = [(X_test1, y_test1), (X_test2, y_test2)]
    params_fit["eval_metric"] = ["l1", "l2"]
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l1, True)
    params_fit["eval_metric"] = ["l2", "l1"]
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l2, True)

    params_fit["eval_set"] = [(X_test2, y_test2), (X_test1, y_test1)]
    params_fit["eval_metric"] = ["l1", "l2"]
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min, False)
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l1, True)
    params_fit["eval_metric"] = ["l2", "l1"]
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min, False)
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l2, True)
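Note: for readers following the first_metric_only assertions above, here is a minimal, hedged sketch of the behaviour being tested, written against the public callback API rather than the test helpers (data, estimator settings, and variable names are illustrative, not part of this commit):

import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=10, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, random_state=0)

reg = lgb.LGBMRegressor(n_estimators=100, verbose=-1)
reg.fit(
    X_tr,
    y_tr,
    eval_set=[(X_va, y_va)],
    eval_metric=["l1", "l2"],
    # with first_metric_only=True, only the first metric is intended to decide
    # when training stops; every metric is still recorded in evals_result_
    callbacks=[lgb.early_stopping(stopping_rounds=5, first_metric_only=True)],
)
print(reg.best_iteration_, sorted(reg.evals_result_["valid_0"].keys()))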
def test_class_weight():
    X, y = load_digits(n_class=10, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    y_train_str = y_train.astype("str")
    y_test_str = y_test.astype("str")
    gbm = lgb.LGBMClassifier(n_estimators=10, class_weight="balanced", verbose=-1)
    gbm.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test), (X_test, y_test), (X_test, y_test), (X_test, y_test)],
        eval_class_weight=["balanced", None, "balanced", {1: 10, 4: 20}, {5: 30, 2: 40}],
    )
    for eval_set1, eval_set2 in itertools.combinations(gbm.evals_result_.keys(), 2):
        for metric in gbm.evals_result_[eval_set1]:
            np.testing.assert_raises(
                AssertionError,
                np.testing.assert_allclose,
                gbm.evals_result_[eval_set1][metric],
                gbm.evals_result_[eval_set2][metric],
            )
    gbm_str = lgb.LGBMClassifier(n_estimators=10, class_weight="balanced", verbose=-1)
    gbm_str.fit(
        X_train,
        y_train_str,
        eval_set=[
            (X_train, y_train_str),
            (X_test, y_test_str),
            (X_test, y_test_str),
            (X_test, y_test_str),
            (X_test, y_test_str),
        ],
        eval_class_weight=["balanced", None, "balanced", {"1": 10, "4": 20}, {"5": 30, "2": 40}],
    )
    for eval_set1, eval_set2 in itertools.combinations(gbm_str.evals_result_.keys(), 2):
        for metric in gbm_str.evals_result_[eval_set1]:
            np.testing.assert_raises(
                AssertionError,
                np.testing.assert_allclose,
                gbm_str.evals_result_[eval_set1][metric],
                gbm_str.evals_result_[eval_set2][metric],
            )
    for eval_set in gbm.evals_result_:
        for metric in gbm.evals_result_[eval_set]:
            np.testing.assert_allclose(gbm.evals_result_[eval_set][metric], gbm_str.evals_result_[eval_set][metric])
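Note: a compact, hedged illustration of the two parameters exercised by test_class_weight — class_weight reweights the training labels, while eval_class_weight applies a weighting per eval_set entry when metrics are computed (the toy data below is illustrative, not taken from this commit):

import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, weights=[0.9, 0.1], random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, stratify=y, random_state=0)

clf = lgb.LGBMClassifier(n_estimators=20, class_weight={0: 1.0, 1: 5.0}, verbose=-1)
clf.fit(
    X_tr,
    y_tr,
    eval_set=[(X_va, y_va)],
    eval_class_weight=["balanced"],  # one entry per eval_set element
)
print(clf.evals_result_["valid_0"]["binary_logloss"][-1])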
def test_continue_training_with_model():
    X, y = load_digits(n_class=3, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    init_gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test))
    gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test), init_model=init_gbm)
    assert len(init_gbm.evals_result_["valid_0"]["multi_logloss"]) == len(gbm.evals_result_["valid_0"]["multi_logloss"])
    assert len(init_gbm.evals_result_["valid_0"]["multi_logloss"]) == 5
    assert gbm.evals_result_["valid_0"]["multi_logloss"][-1] < init_gbm.evals_result_["valid_0"]["multi_logloss"][-1]


def test_actual_number_of_trees():
    ...
    ...
@@ -1288,20 +1282,16 @@ def test_sklearn_integration(estimator, check):
    check(estimator)


@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
    pd = pytest.importorskip("pandas")
    X, y, g = _create_data(task)
    X = pd.DataFrame(X)
    y_col_array = y.reshape(-1, 1)
    params = {"n_estimators": 1, "num_leaves": 3, "random_state": 0}
    model_factory = task_to_model_factory[task]
    with pytest.warns(UserWarning, match="column-vector"):
        if task == "ranking":
            model_1d = model_factory(**params).fit(X, y, group=g)
            model_2d = model_factory(**params).fit(X, y_col_array, group=g)
        else:
    ...
@@ -1313,12 +1303,12 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
    np.testing.assert_array_equal(preds_1d, preds_2d)


@pytest.mark.parametrize("use_weight", [True, False])
def test_multiclass_custom_objective(use_weight):
    centers = [[-4, -4], [4, 4], [-4, 4]]
    X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
    weight = np.full_like(y, 2) if use_weight else None
    params = {"n_estimators": 10, "num_leaves": 7}
    builtin_obj_model = lgb.LGBMClassifier(**params)
    builtin_obj_model.fit(X, y, sample_weight=weight)
    builtin_obj_preds = builtin_obj_model.predict_proba(X)
    ...
@@ -1332,11 +1322,11 @@ def test_multiclass_custom_objective(use_weight):
    assert callable(custom_obj_model.objective_)


@pytest.mark.parametrize("use_weight", [True, False])
def test_multiclass_custom_eval(use_weight):
    def custom_eval(y_true, y_pred, weight):
        loss = log_loss(y_true, y_pred, sample_weight=weight)
        return "custom_logloss", loss, False

    centers = [[-4, -4], [4, 4], [-4, 4]]
    X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
    ...
@@ -1348,27 +1338,25 @@ def test_multiclass_custom_eval(use_weight):
    else:
        weight_train = None
        weight_valid = None
    params = {"objective": "multiclass", "num_class": 3, "num_leaves": 7}
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        sample_weight=weight_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_names=["train", "valid"],
        eval_sample_weight=[weight_train, weight_valid],
        eval_metric=custom_eval,
    )
    eval_result = model.evals_result_
    train_ds = (X_train, y_train, weight_train)
    valid_ds = (X_valid, y_valid, weight_valid)
    for key, (X, y_true, weight) in zip(["train", "valid"], [train_ds, valid_ds]):
        np.testing.assert_allclose(eval_result[key]["multi_logloss"], eval_result[key]["custom_logloss"])
        y_pred = model.predict_proba(X)
        _, metric_value, _ = custom_eval(y_true, y_pred, weight)
        np.testing.assert_allclose(metric_value, eval_result[key]["custom_logloss"][-1])
def test_negative_n_jobs(tmp_path):
    ...
@@ -1397,21 +1385,21 @@ def test_default_n_jobs(tmp_path):
    assert bool(re.search(rf"\[num_threads: {n_cores}\]", model_txt))


@pytest.mark.skipif(not PANDAS_INSTALLED, reason="pandas is not installed")
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
def test_validate_features(task):
    X, y, g = _create_data(task, n_features=4)
    features = ["x1", "x2", "x3", "x4"]
    df = pd_DataFrame(X, columns=features)
    model = task_to_model_factory[task](n_estimators=10, num_leaves=15, verbose=-1)
    if task == "ranking":
        model.fit(df, y, group=g)
    else:
        model.fit(df, y)
    assert model.feature_name_ == features

    # try to predict with a different feature
    df2 = df.rename(columns={"x2": "z"})
    with pytest.raises(lgb.basic.LightGBMError, match="Expected 'x2' at position 1 but found 'z'"):
        model.predict(df2, validate_features=True)
    ...
@@ -1419,59 +1407,59 @@ def test_validate_features(task):
    model.predict(df2, validate_features=False)


@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task):
    if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
        pytest.skip("pandas is not installed")
    if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
        pytest.skip("datatable is not installed")
    X, y, g = _create_data(task, n_samples=2_000)
    weights = np.abs(np.random.randn(y.shape[0]))

    if task == "binary-classification" or task == "regression":
        init_score = np.full_like(y, np.mean(y))
    elif task == "multiclass-classification":
        init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))
    else:
        raise ValueError(f"Unrecognized task '{task}'")

    X_valid = X * 2
    if X_type == "dt_DataTable":
        X = dt_DataTable(X)
    elif X_type == "list2d":
        X = X.tolist()
    elif X_type == "scipy_csc":
        X = scipy.sparse.csc_matrix(X)
    elif X_type == "scipy_csr":
        X = scipy.sparse.csr_matrix(X)
    elif X_type == "pd_DataFrame":
        X = pd_DataFrame(X)
    elif X_type != "numpy":
        raise ValueError(f"Unrecognized X_type: '{X_type}'")

    # make weights and init_score same types as y, just to avoid
    # a huge number of combinations and therefore test cases
    if y_type == "list1d":
        y = y.tolist()
        weights = weights.tolist()
        init_score = init_score.tolist()
    elif y_type == "pd_DataFrame":
        y = pd_DataFrame(y)
        weights = pd_Series(weights)
        if task == "multiclass-classification":
            init_score = pd_DataFrame(init_score)
        else:
            init_score = pd_Series(init_score)
    elif y_type == "pd_Series":
        y = pd_Series(y)
        weights = pd_Series(weights)
        if task == "multiclass-classification":
            init_score = pd_DataFrame(init_score)
        else:
            init_score = pd_Series(init_score)
    elif y_type != "numpy":
        raise ValueError(f"Unrecognized y_type: '{y_type}'")

    model = task_to_model_factory[task](n_estimators=10, verbose=-1)
    ...
@@ -1482,73 +1470,73 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
        init_score=init_score,
        eval_set=[(X_valid, y)],
        eval_sample_weight=[weights],
        eval_init_score=[init_score],
    )
    preds = model.predict(X)
    if task == "binary-classification":
        assert accuracy_score(y, preds) >= 0.99
    elif task == "multiclass-classification":
        assert accuracy_score(y, preds) >= 0.99
    elif task == "regression":
        assert r2_score(y, preds) > 0.86
    else:
        raise ValueError(f"Unrecognized task: '{task}'")
@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"])
@pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"])
def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type):
    if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
        pytest.skip("pandas is not installed")
    if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
        pytest.skip("datatable is not installed")
    X, y, g = _create_data(task="ranking", n_samples=1_000)
    weights = np.abs(np.random.randn(y.shape[0]))
    init_score = np.full_like(y, np.mean(y))
    X_valid = X * 2
    if X_type == "dt_DataTable":
        X = dt_DataTable(X)
    elif X_type == "list2d":
        X = X.tolist()
    elif X_type == "scipy_csc":
        X = scipy.sparse.csc_matrix(X)
    elif X_type == "scipy_csr":
        X = scipy.sparse.csr_matrix(X)
    elif X_type == "pd_DataFrame":
        X = pd_DataFrame(X)
    elif X_type != "numpy":
        raise ValueError(f"Unrecognized X_type: '{X_type}'")

    # make weights and init_score same types as y, just to avoid
    # a huge number of combinations and therefore test cases
    if y_type == "list1d":
        y = y.tolist()
        weights = weights.tolist()
        init_score = init_score.tolist()
    elif y_type == "pd_DataFrame":
        y = pd_DataFrame(y)
        weights = pd_Series(weights)
        init_score = pd_Series(init_score)
    elif y_type == "pd_Series":
        y = pd_Series(y)
        weights = pd_Series(weights)
        init_score = pd_Series(init_score)
    elif y_type != "numpy":
        raise ValueError(f"Unrecognized y_type: '{y_type}'")

    if g_type == "list1d_float":
        g = g.astype("float").tolist()
    elif g_type == "list1d_int":
        g = g.astype("int").tolist()
    elif g_type == "pd_Series":
        g = pd_Series(g)
    elif g_type != "numpy":
        raise ValueError(f"Unrecognized g_type: '{g_type}'")

    model = task_to_model_factory["ranking"](n_estimators=10, verbose=-1)
    model.fit(
        X=X,
        y=y,
    ...
@@ -1558,7 +1546,7 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type
        eval_set=[(X_valid, y)],
        eval_sample_weight=[weights],
        eval_init_score=[init_score],
        eval_group=[g],
    )
    preds = model.predict(X)
    assert spearmanr(preds, y).correlation >= 0.99
    ...
@@ -1570,7 +1558,7 @@ def test_classifier_fit_detects_classes_every_time():
    ncols = 20
    X = rng.standard_normal(size=(nrows, ncols))
    y_bin = (rng.random(size=nrows) <= 0.3).astype(np.float64)
    y_multi = rng.integers(4, size=nrows)

    model = lgb.LGBMClassifier(verbose=-1)
    ...
tests/python_package_test/test_utilities.py    View file @ 1b792e71
    ...
@@ -10,7 +10,7 @@ import lightgbm as lgb
def test_register_logger(tmp_path):
    logger = logging.getLogger("LightGBM")
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(levelname)s | %(message)s")
    log_filename = tmp_path / "LightGBM_test_logger.log"
    file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    ...
@@ -18,29 +18,27 @@ def test_register_logger(tmp_path):
    logger.addHandler(file_handler)

    def dummy_metric(_, __):
        logger.debug("In dummy_metric")
        return "dummy_metric", 1, True

    lgb.register_logger(logger)

    X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
    y = np.array([0, 1, 1, 0])
    lgb_train = lgb.Dataset(X, y)
    lgb_valid = lgb.Dataset(X, y)  # different object for early-stopping

    eval_records = {}
    callbacks = [
        lgb.record_evaluation(eval_records),
        lgb.log_evaluation(2),
        lgb.early_stopping(10),
    ]
    lgb.train(
        {"objective": "binary", "metric": ["auc", "binary_error"]},
        lgb_train,
        num_boost_round=10,
        feval=dummy_metric,
        valid_sets=[lgb_valid],
        categorical_feature=[1],
        callbacks=callbacks,
    )

    lgb.plot_metric(eval_records)
    ...
@@ -89,7 +87,7 @@ WARNING | More than one metric available, picking one to plot.
        "INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
        "INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
        "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
        "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!",
    ]
    cuda_lines = [
        "INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda version. Fall back to evaluation on CPU.",
    ...
@@ -142,11 +140,7 @@ def test_register_custom_logger():
            logged_messages.append(msg)

    custom_logger = CustomLogger()
    lgb.register_logger(custom_logger, info_method_name="custom_info", warning_method_name="custom_warning")

    lgb.basic._log_info("info message")
    lgb.basic._log_warning("warning message")
    ...
@@ -155,18 +149,14 @@ def test_register_custom_logger():
    assert logged_messages == expected_log

    logged_messages = []
    X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
    y = np.array([0, 1, 1, 0])
    lgb_data = lgb.Dataset(X, y)
    lgb.train(
        {"objective": "binary", "metric": "auc"},
        lgb_data,
        num_boost_round=10,
        valid_sets=[lgb_data],
        categorical_feature=[1],
    )
    assert logged_messages, "custom logger was not called"
tests/python_package_test/utils.py    View file @ 1b792e71
    ...
@@ -34,8 +34,9 @@ def load_linnerud(**kwargs):
    return sklearn.datasets.load_linnerud(**kwargs)


def make_ranking(
    n_samples=100, n_features=20, n_informative=5, gmax=2, group=None, random_gs=False, avg_gs=10, random_state=0
):
    """Generate a learning-to-rank dataset - feature vectors grouped together with
    integer-valued graded relevance scores. Replace this with a sklearn.datasets function
    if ranking objective becomes supported in sklearn.datasets module.
    ...
@@ -81,7 +82,7 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
    relvalues = range(gmax + 1)

    # build y/target and group-id vectors with user-specified group sizes.
    if group is not None and hasattr(group, "__len__"):
        n_samples = np.sum(group)

        for i, gsize in enumerate(group):
    ...
@@ -116,8 +117,9 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
@lru_cache(maxsize=None)
def make_synthetic_regression(n_samples=100, n_features=4, n_informative=2, random_state=42):
    return sklearn.datasets.make_regression(
        n_samples=n_samples, n_features=n_features, n_informative=n_informative, random_state=random_state
    )


def dummy_obj(preds, train_data):
    ...
@@ -126,7 +128,7 @@ def dummy_obj(preds, train_data):
def mse_obj(y_pred, dtrain):
    y_true = dtrain.get_label()
    grad = y_pred - y_true
    hess = np.ones(len(grad))
    return grad, hess
    ...
@@ -157,50 +159,41 @@ def sklearn_multiclass_custom_objective(y_true, y_pred, weight=None):
def pickle_obj(obj, filepath, serializer):
    if serializer == "pickle":
        with open(filepath, "wb") as f:
            pickle.dump(obj, f)
    elif serializer == "joblib":
        joblib.dump(obj, filepath)
    elif serializer == "cloudpickle":
        with open(filepath, "wb") as f:
            cloudpickle.dump(obj, f)
    else:
        raise ValueError(f"Unrecognized serializer type: {serializer}")


def unpickle_obj(filepath, serializer):
    if serializer == "pickle":
        with open(filepath, "rb") as f:
            return pickle.load(f)
    elif serializer == "joblib":
        return joblib.load(filepath)
    elif serializer == "cloudpickle":
        with open(filepath, "rb") as f:
            return cloudpickle.load(f)
    else:
        raise ValueError(f"Unrecognized serializer type: {serializer}")


def pickle_and_unpickle_object(obj, serializer):
    with lgb.basic._TempFile() as tmp_file:
        pickle_obj(obj=obj, filepath=tmp_file.name, serializer=serializer)
        obj_from_disk = unpickle_obj(filepath=tmp_file.name, serializer=serializer)
    return obj_from_disk  # noqa: RET504


# doing this here, at import time, to ensure it only runs once_per import
# instead of once per assertion
_numpy_testing_supports_strict_kwarg = "strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs


def np_assert_array_equal(*args, **kwargs):
    ...
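Note: mse_obj above is an engine-style custom objective (it receives the raw predictions and a Dataset). In the scikit-learn interface the equivalent callable instead receives (y_true, y_pred) and likewise returns the gradient and hessian of the loss; a minimal hedged sketch, with illustrative names and data not taken from this commit:

import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_regression

def squared_error_objective(y_true, y_pred):
    # gradient and hessian of 0.5 * (y_pred - y_true)^2 with respect to y_pred
    grad = y_pred - y_true
    hess = np.ones_like(y_pred)
    return grad, hess

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
reg = lgb.LGBMRegressor(n_estimators=20, objective=squared_error_objective, verbose=-1)
reg.fit(X, y)
print(reg.predict(X[:3]))  # raw scores, since a custom objective is in use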