tianlh / LightGBM-DCU · Commits

Commit da443871 (unverified)
Authored Jan 24, 2021 by Nikita Titov; committed by GitHub on Jan 23, 2021
[dask][tests] move make_ranking into utils (#3827)

* move make_ranking into utils
* do not cache
Parent: 73633789
Showing 2 changed files, with 88 additions and 85 deletions:

* tests/python_package_test/test_dask.py (+6, -85)
* tests/python_package_test/utils.py (+82, -0)
tests/python_package_test/test_dask.py
@@ -25,6 +25,9 @@ from sklearn.utils import check_random_state
 import lightgbm
 import lightgbm.dask as dlgbm
+
+from .utils import make_ranking

 data_output = ['array', 'scipy_csr_matrix', 'dataframe']
 data_centers = [[[-4, -4], [4, 4]], [[-4, -4], [4, 4], [-4, 4]]]
 group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
@@ -44,92 +47,13 @@ def listen_port():
     listen_port.port = 13000

-def _make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
-                  group=None, random_gs=False, avg_gs=10, random_state=0):
-    """Generate a learning-to-rank dataset - feature vectors grouped together with
-    integer-valued graded relevance scores. Replace this with a sklearn.datasets function
-    if ranking objective becomes supported in sklearn.datasets module.
-
-    Parameters
-    ----------
-    n_samples : int, optional (default=100)
-        Total number of documents (records) in the dataset.
-    n_features : int, optional (default=20)
-        Total number of features in the dataset.
-    n_informative : int, optional (default=5)
-        Number of features that are "informative" for ranking, as they are bias + beta * y
-        where bias and beta are standard normal variates. If this is greater than n_features,
-        the dataset will have n_features features, all will be informative.
-    group : array-like, optional (default=None)
-        1-d array or list of group sizes. When `group` is specified, this overrides n_samples,
-        random_gs, and avg_gs by simply creating groups with sizes group[0], ..., group[-1].
-    gmax : int, optional (default=2)
-        Maximum graded relevance value for creating relevance/target vector. If you set this to 2,
-        for example, all documents in a group will have relevance scores of either 0, 1, or 2.
-    random_gs : bool, optional (default=False)
-        True will make group sizes ~ Poisson(avg_gs), False will make group sizes == avg_gs.
-    avg_gs : int, optional (default=10)
-        Average number of documents (records) in each group.
-
-    Returns
-    -------
-    X : 2-d np.ndarray of shape = [n_samples (or np.sum(group)), n_features]
-        Input feature matrix for ranking objective.
-    y : 1-d np.array of shape = [n_samples (or np.sum(group))]
-        Integer-graded relevance scores.
-    group_ids : 1-d np.array of shape = [n_samples (or np.sum(group))]
-        Array of group ids, each value indicates to which group each record belongs.
-    """
-    rnd_generator = check_random_state(random_state)
-
-    y_vec, group_id_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)
-    gid = 0
-
-    # build target, group ID vectors.
-    relvalues = range(gmax + 1)
-
-    # build y/target and group-id vectors with user-specified group sizes.
-    if group is not None and hasattr(group, '__len__'):
-        n_samples = np.sum(group)
-
-        for i, gsize in enumerate(group):
-            y_vec = np.concatenate((y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True)))
-            group_id_vec = np.concatenate((group_id_vec, [i] * gsize))
-
-    # build y/target and group-id vectors according to n_samples, avg_gs, and random_gs.
-    else:
-        while len(y_vec) < n_samples:
-            gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)
-
-            # groups should contain > 1 element for pairwise learning objective.
-            if gsize < 1:
-                continue
-
-            y_vec = np.append(y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True))
-            group_id_vec = np.append(group_id_vec, [gid] * gsize)
-            gid += 1
-
-        y_vec, group_id_vec = y_vec[:n_samples], group_id_vec[:n_samples]
-
-    # build feature data, X. Transform first few into informative features.
-    n_informative = max(min(n_features, n_informative), 0)
-    X = rnd_generator.uniform(size=(n_samples, n_features))
-
-    for j in range(n_informative):
-        bias, coef = rnd_generator.normal(size=2)
-        X[:, j] = bias + coef * y_vec
-
-    return X, y_vec, group_id_vec
-
-
 def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs):
-    X, y, g = _make_ranking(n_samples=n_samples, random_state=42, **kwargs)
+    X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs)
     rnd = np.random.RandomState(42)
     w = rnd.rand(X.shape[0]) * 0.01
     g_rle = np.array([len(list(grp)) for _, grp in itertools.groupby(g)])

     if output == 'dataframe':
         # add target, weight, and group to DataFrame so that partitions abide by group boundaries.
         X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
         X = X_df.copy()
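For context on what the moved helper produces (a sketch of mine, not part of the diff): make_ranking returns a feature matrix, integer relevance grades, and a sorted group-id vector, so group sizes can be recovered by run-length encoding exactly as g_rle does above. Assuming utils.py is importable from the test directory:

    import itertools
    import numpy as np
    from utils import make_ranking  # assumes tests/python_package_test is on sys.path

    X, y, g = make_ranking(n_samples=100, n_features=20, random_state=42)
    assert X.shape == (100, 20)
    assert set(np.unique(y)) <= {0, 1, 2}  # default gmax=2 -> relevance grades 0..2
    # g is sorted, so run-length encoding recovers the group sizes:
    g_rle = np.array([len(list(grp)) for _, grp in itertools.groupby(g)])
    assert g_rle.sum() == 100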
@@ -149,9 +73,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
         # encode group identifiers into run-length encoding, the format LightGBMRanker is expecting
         # so that within each partition, sum(g) = n_samples.
         dg = dg.map_partitions(lambda p: p.groupby('g', sort=False).apply(lambda z: z.shape[0]))
     elif output == 'array':
         # ranking arrays: one chunk per group. Each chunk must include all columns.
         p = X.shape[1]
         dX, dy, dw, dg = [], [], [], []
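The comment in the hunk above is the key constraint: the ranker consumes group sizes, not per-row group ids, and within each Dask partition those sizes must sum to the partition's row count. A minimal sketch of what the map_partitions lambda computes for a single partition (toy data, not from the diff):

    import pandas as pd

    p = pd.DataFrame({'g': [0, 0, 0, 1, 1]})  # one partition, sorted group ids
    sizes = p.groupby('g', sort=False).apply(lambda z: z.shape[0])
    assert list(sizes) == [3, 2]  # run lengths; their sum equals the partition's rows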
@@ -166,7 +88,6 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
         dy = da.concatenate(dy, axis=0)
         dw = da.concatenate(dw, axis=0)
         dg = da.concatenate(dg, axis=0)
     else:
         raise ValueError('Ranking data creation only supported for Dask arrays and dataframes')
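For the 'array' branch, the point of building one Dask chunk per query group (per the earlier comment) is that chunk boundaries then coincide with group boundaries after concatenation. A toy illustration of mine, not taken from the diff:

    import numpy as np
    import dask.array as da

    groups = [np.ones((5, 3)), np.ones((10, 3))]  # two query groups, 3 features each
    dX = da.concatenate([da.from_array(a, chunks=a.shape) for a in groups], axis=0)
    assert dX.chunks[0] == (5, 10)  # row chunks line up with group boundaries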
@@ -179,7 +100,7 @@ def _create_data(objective, n_samples=100, centers=2, output='array', chunk_size
     elif objective == 'regression':
         X, y = make_regression(n_samples=n_samples, random_state=42)
     else:
-        raise ValueError(objective)
+        raise ValueError("Unknown objective '%s'" % objective)
     rnd = np.random.RandomState(42)
     weights = rnd.random(X.shape[0]) * 0.01
@@ -198,7 +119,7 @@ def _create_data(objective, n_samples=100, centers=2, output='array', chunk_size
         dy = da.from_array(y, chunks=chunk_size)
         dw = da.from_array(weights, chunk_size)
     else:
-        raise ValueError("Unknown output type %s" % output)
+        raise ValueError("Unknown output type '%s'" % output)
     return X, y, weights, dX, dy, dw
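One nuance in the context lines above: da.from_array(weights, chunk_size) passes the chunk specification positionally, which is equivalent to chunks=chunk_size. A quick illustrative check:

    import numpy as np
    import dask.array as da

    weights = np.random.RandomState(42).random(100) * 0.01
    dw = da.from_array(weights, 50)  # same as chunks=50
    assert dw.chunks == ((50, 50),)  # 100 rows split into two 50-row chunks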
tests/python_package_test/utils.py
 # coding: utf-8
 from functools import lru_cache

+import numpy as np
 import sklearn.datasets
+from sklearn.utils import check_random_state


 @lru_cache(maxsize=None)
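Note that, per the commit message's "do not cache", make_ranking below is deliberately not wrapped in @lru_cache like the dataset loaders. The diff does not state the reason, but a plausible one (my reading, not the authors' stated rationale) is that lru_cache hands every caller the same object, so a test mutating a cached numpy array would leak that mutation into other tests; list-valued arguments such as group would also be unhashable. A sketch of the hazard:

    from functools import lru_cache
    import numpy as np

    @lru_cache(maxsize=None)
    def cached_data():
        return np.zeros(3)

    a = cached_data()
    a[0] = 99.0                       # mutates the cached object in place...
    assert cached_data()[0] == 99.0   # ...so every later call sees the change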
@@ -27,3 +29,83 @@ def load_iris(**kwargs):
 @lru_cache(maxsize=None)
 def load_linnerud(**kwargs):
     return sklearn.datasets.load_linnerud(**kwargs)
+
+
+def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
+                 group=None, random_gs=False, avg_gs=10, random_state=0):
+    """Generate a learning-to-rank dataset - feature vectors grouped together with
+    integer-valued graded relevance scores. Replace this with a sklearn.datasets function
+    if ranking objective becomes supported in sklearn.datasets module.
+
+    Parameters
+    ----------
+    n_samples : int, optional (default=100)
+        Total number of documents (records) in the dataset.
+    n_features : int, optional (default=20)
+        Total number of features in the dataset.
+    n_informative : int, optional (default=5)
+        Number of features that are "informative" for ranking, as they are bias + beta * y
+        where bias and beta are standard normal variates. If this is greater than n_features,
+        the dataset will have n_features features, all will be informative.
+    gmax : int, optional (default=2)
+        Maximum graded relevance value for creating relevance/target vector. If you set this to 2,
+        for example, all documents in a group will have relevance scores of either 0, 1, or 2.
+    group : array-like, optional (default=None)
+        1-d array or list of group sizes. When `group` is specified, this overrides n_samples,
+        random_gs, and avg_gs by simply creating groups with sizes group[0], ..., group[-1].
+    random_gs : bool, optional (default=False)
+        True will make group sizes ~ Poisson(avg_gs), False will make group sizes == avg_gs.
+    avg_gs : int, optional (default=10)
+        Average number of documents (records) in each group.
+    random_state : int, optional (default=0)
+        Random seed.
+
+    Returns
+    -------
+    X : 2-d np.ndarray of shape = [n_samples (or np.sum(group)), n_features]
+        Input feature matrix for ranking objective.
+    y : 1-d np.array of shape = [n_samples (or np.sum(group))]
+        Integer-graded relevance scores.
+    group_ids : 1-d np.array of shape = [n_samples (or np.sum(group))]
+        Array of group ids, each value indicates to which group each record belongs.
+    """
+    rnd_generator = check_random_state(random_state)
+
+    y_vec, group_id_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)
+    gid = 0
+
+    # build target, group ID vectors.
+    relvalues = range(gmax + 1)
+
+    # build y/target and group-id vectors with user-specified group sizes.
+    if group is not None and hasattr(group, '__len__'):
+        n_samples = np.sum(group)
+
+        for i, gsize in enumerate(group):
+            y_vec = np.concatenate((y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True)))
+            group_id_vec = np.concatenate((group_id_vec, [i] * gsize))
+
+    # build y/target and group-id vectors according to n_samples, avg_gs, and random_gs.
+    else:
+        while len(y_vec) < n_samples:
+            gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)
+
+            # groups should contain > 1 element for pairwise learning objective.
+            if gsize < 1:
+                continue
+
+            y_vec = np.append(y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True))
+            group_id_vec = np.append(group_id_vec, [gid] * gsize)
+            gid += 1
+
+        y_vec, group_id_vec = y_vec[:n_samples], group_id_vec[:n_samples]
+
+    # build feature data, X. Transform first few into informative features.
+    n_informative = max(min(n_features, n_informative), 0)
+    X = rnd_generator.uniform(size=(n_samples, n_features))
+
+    for j in range(n_informative):
+        bias, coef = rnd_generator.normal(size=2)
+        X[:, j] = bias + coef * y_vec
+
+    return X, y_vec, group_id_vec
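A short usage sketch of the newly added helper, showing how an explicit group argument overrides n_samples/avg_gs/random_gs as the docstring describes (import path assumed, not part of the diff):

    import numpy as np
    from utils import make_ranking  # assuming tests/python_package_test is on sys.path

    X, y, g = make_ranking(group=[5, 10, 20], n_features=4, random_state=0)
    assert X.shape == (35, 4)                   # 5 + 10 + 20 documents
    assert list(np.bincount(g)) == [5, 10, 20]  # group ids 0, 1, 2 with given sizes
    assert y.max() <= 2                         # graded relevance in 0..gmax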