tianlh / LightGBM-DCU · Commits

Commit da443871 (unverified)
Authored Jan 24, 2021 by Nikita Titov; committed by GitHub on Jan 23, 2021
[dask][tests] move make_ranking into utils (#3827)

* move make_ranking into utils
* do not cache
Parent: 73633789
Showing 2 changed files, with 88 additions and 85 deletions:

* tests/python_package_test/test_dask.py (+6, -85)
* tests/python_package_test/utils.py (+82, -0)
tests/python_package_test/test_dask.py
@@ -25,6 +25,9 @@ from sklearn.utils import check_random_state
 import lightgbm
 import lightgbm.dask as dlgbm
+
+from .utils import make_ranking

 data_output = ['array', 'scipy_csr_matrix', 'dataframe']
 data_centers = [[[-4, -4], [4, 4]], [[-4, -4], [4, 4], [-4, 4]]]
 group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
@@ -44,92 +47,13 @@ def listen_port():
     listen_port.port = 13000

-def _make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
-                  group=None, random_gs=False, avg_gs=10, random_state=0):
-    """Generate a learning-to-rank dataset - feature vectors grouped together with
-    integer-valued graded relevance scores. Replace this with a sklearn.datasets function
-    if ranking objective becomes supported in sklearn.datasets module.
-
-    Parameters
-    ----------
-    n_samples : int, optional (default=100)
-        Total number of documents (records) in the dataset.
-    n_features : int, optional (default=20)
-        Total number of features in the dataset.
-    n_informative : int, optional (default=5)
-        Number of features that are "informative" for ranking, as they are bias + beta * y
-        where bias and beta are standard normal variates. If this is greater than n_features,
-        the dataset will have n_features features, all will be informative.
-    group : array-like, optional (default=None)
-        1-d array or list of group sizes. When `group` is specified, this overrides n_samples,
-        random_gs, and avg_gs by simply creating groups with sizes group[0], ..., group[-1].
-    gmax : int, optional (default=2)
-        Maximum graded relevance value for creating relevance/target vector. If you set this to 2,
-        for example, all documents in a group will have relevance scores of either 0, 1, or 2.
-    random_gs : bool, optional (default=False)
-        True will make group sizes ~ Poisson(avg_gs), False will make group sizes == avg_gs.
-    avg_gs : int, optional (default=10)
-        Average number of documents (records) in each group.
-
-    Returns
-    -------
-    X : 2-d np.ndarray of shape = [n_samples (or np.sum(group)), n_features]
-        Input feature matrix for ranking objective.
-    y : 1-d np.array of shape = [n_samples (or np.sum(group))]
-        Integer-graded relevance scores.
-    group_ids : 1-d np.array of shape = [n_samples (or np.sum(group))]
-        Array of group ids, each value indicates to which group each record belongs.
-    """
-    rnd_generator = check_random_state(random_state)
-
-    y_vec, group_id_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)
-    gid = 0
-
-    # build target, group ID vectors.
-    relvalues = range(gmax + 1)
-
-    # build y/target and group-id vectors with user-specified group sizes.
-    if group is not None and hasattr(group, '__len__'):
-        n_samples = np.sum(group)
-
-        for i, gsize in enumerate(group):
-            y_vec = np.concatenate((y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True)))
-            group_id_vec = np.concatenate((group_id_vec, [i] * gsize))
-
-    # build y/target and group-id vectors according to n_samples, avg_gs, and random_gs.
-    else:
-        while len(y_vec) < n_samples:
-            gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)
-
-            # groups should contain > 1 element for pairwise learning objective.
-            if gsize < 1:
-                continue
-
-            y_vec = np.append(y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True))
-            group_id_vec = np.append(group_id_vec, [gid] * gsize)
-            gid += 1
-
-        y_vec, group_id_vec = y_vec[:n_samples], group_id_vec[:n_samples]
-
-    # build feature data, X. Transform first few into informative features.
-    n_informative = max(min(n_features, n_informative), 0)
-    X = rnd_generator.uniform(size=(n_samples, n_features))
-
-    for j in range(n_informative):
-        bias, coef = rnd_generator.normal(size=2)
-        X[:, j] = bias + coef * y_vec
-
-    return X, y_vec, group_id_vec
-
-
 def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs):
-    X, y, g = _make_ranking(n_samples=n_samples, random_state=42, **kwargs)
+    X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs)
     rnd = np.random.RandomState(42)
     w = rnd.rand(X.shape[0]) * 0.01
     g_rle = np.array([len(list(grp)) for _, grp in itertools.groupby(g)])

     if output == 'dataframe':
         # add target, weight, and group to DataFrame so that partitions abide by group boundaries.
         X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
         X = X_df.copy()
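For context on what the moved helper produces (a sketch of mine, not part of the diff): make_ranking returns a feature matrix, integer relevance grades, and a sorted group-id vector, so group sizes can be recovered by run-length encoding exactly as g_rle does above. Assuming utils.py is importable from the test directory:

    import itertools
    import numpy as np
    from utils import make_ranking  # assumes tests/python_package_test is on sys.path

    X, y, g = make_ranking(n_samples=100, n_features=20, random_state=42)
    assert X.shape == (100, 20)
    assert set(np.unique(y)) <= {0, 1, 2}  # default gmax=2 -> relevance grades 0..2
    # g is sorted, so run-length encoding recovers the group sizes:
    g_rle = np.array([len(list(grp)) for _, grp in itertools.groupby(g)])
    assert g_rle.sum() == 100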
@@ -149,9 +73,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
         # encode group identifiers into run-length encoding, the format LightGBMRanker is expecting
         # so that within each partition, sum(g) = n_samples.
         dg = dg.map_partitions(lambda p: p.groupby('g', sort=False).apply(lambda z: z.shape[0]))
     elif output == 'array':
         # ranking arrays: one chunk per group. Each chunk must include all columns.
         p = X.shape[1]
         dX, dy, dw, dg = [], [], [], []
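The comment in the hunk above is the key constraint: the ranker consumes group sizes, not per-row group ids, and within each Dask partition those sizes must sum to the partition's row count. A minimal sketch of what the map_partitions lambda computes for a single partition (toy data, not from the diff):

    import pandas as pd

    p = pd.DataFrame({'g': [0, 0, 0, 1, 1]})  # one partition, sorted group ids
    sizes = p.groupby('g', sort=False).apply(lambda z: z.shape[0])
    assert list(sizes) == [3, 2]  # run lengths; their sum equals the partition's rows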
@@ -166,7 +88,6 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
         dy = da.concatenate(dy, axis=0)
         dw = da.concatenate(dw, axis=0)
         dg = da.concatenate(dg, axis=0)
     else:
         raise ValueError('Ranking data creation only supported for Dask arrays and dataframes')
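For the 'array' branch, the point of building one Dask chunk per query group (per the earlier comment) is that chunk boundaries then coincide with group boundaries after concatenation. A toy illustration of mine, not taken from the diff:

    import numpy as np
    import dask.array as da

    groups = [np.ones((5, 3)), np.ones((10, 3))]  # two query groups, 3 features each
    dX = da.concatenate([da.from_array(a, chunks=a.shape) for a in groups], axis=0)
    assert dX.chunks[0] == (5, 10)  # row chunks line up with group boundaries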
@@ -179,7 +100,7 @@ def _create_data(objective, n_samples=100, centers=2, output='array', chunk_size
     elif objective == 'regression':
         X, y = make_regression(n_samples=n_samples, random_state=42)
     else:
-        raise ValueError(objective)
+        raise ValueError("Unknown objective '%s'" % objective)
     rnd = np.random.RandomState(42)
     weights = rnd.random(X.shape[0]) * 0.01
@@ -198,7 +119,7 @@ def _create_data(objective, n_samples=100, centers=2, output='array', chunk_size
         dy = da.from_array(y, chunks=chunk_size)
         dw = da.from_array(weights, chunk_size)
     else:
-        raise ValueError("Unknown output type %s" % output)
+        raise ValueError("Unknown output type '%s'" % output)
     return X, y, weights, dX, dy, dw
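One nuance in the context lines above: da.from_array(weights, chunk_size) passes the chunk specification positionally, which is equivalent to chunks=chunk_size. A quick illustrative check:

    import numpy as np
    import dask.array as da

    weights = np.random.RandomState(42).random(100) * 0.01
    dw = da.from_array(weights, 50)  # same as chunks=50
    assert dw.chunks == ((50, 50),)  # 100 rows split into two 50-row chunks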
tests/python_package_test/utils.py
 # coding: utf-8
 from functools import lru_cache

+import numpy as np
 import sklearn.datasets
+from sklearn.utils import check_random_state


 @lru_cache(maxsize=None)
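Note that, per the commit message's "do not cache", make_ranking below is deliberately not wrapped in @lru_cache like the dataset loaders. The diff does not state the reason, but a plausible one (my reading, not the authors' stated rationale) is that lru_cache hands every caller the same object, so a test mutating a cached numpy array would leak that mutation into other tests; list-valued arguments such as group would also be unhashable. A sketch of the hazard:

    from functools import lru_cache
    import numpy as np

    @lru_cache(maxsize=None)
    def cached_data():
        return np.zeros(3)

    a = cached_data()
    a[0] = 99.0                       # mutates the cached object in place...
    assert cached_data()[0] == 99.0   # ...so every later call sees the change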
@@ -27,3 +29,83 @@ def load_iris(**kwargs):
 @lru_cache(maxsize=None)
 def load_linnerud(**kwargs):
     return sklearn.datasets.load_linnerud(**kwargs)
+
+
+def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
+                 group=None, random_gs=False, avg_gs=10, random_state=0):
+    """Generate a learning-to-rank dataset - feature vectors grouped together with
+    integer-valued graded relevance scores. Replace this with a sklearn.datasets function
+    if ranking objective becomes supported in sklearn.datasets module.
+
+    Parameters
+    ----------
+    n_samples : int, optional (default=100)
+        Total number of documents (records) in the dataset.
+    n_features : int, optional (default=20)
+        Total number of features in the dataset.
+    n_informative : int, optional (default=5)
+        Number of features that are "informative" for ranking, as they are bias + beta * y
+        where bias and beta are standard normal variates. If this is greater than n_features,
+        the dataset will have n_features features, all will be informative.
+    gmax : int, optional (default=2)
+        Maximum graded relevance value for creating relevance/target vector. If you set this to 2,
+        for example, all documents in a group will have relevance scores of either 0, 1, or 2.
+    group : array-like, optional (default=None)
+        1-d array or list of group sizes. When `group` is specified, this overrides n_samples,
+        random_gs, and avg_gs by simply creating groups with sizes group[0], ..., group[-1].
+    random_gs : bool, optional (default=False)
+        True will make group sizes ~ Poisson(avg_gs), False will make group sizes == avg_gs.
+    avg_gs : int, optional (default=10)
+        Average number of documents (records) in each group.
+    random_state : int, optional (default=0)
+        Random seed.
+
+    Returns
+    -------
+    X : 2-d np.ndarray of shape = [n_samples (or np.sum(group)), n_features]
+        Input feature matrix for ranking objective.
+    y : 1-d np.array of shape = [n_samples (or np.sum(group))]
+        Integer-graded relevance scores.
+    group_ids : 1-d np.array of shape = [n_samples (or np.sum(group))]
+        Array of group ids, each value indicates to which group each record belongs.
+    """
+    rnd_generator = check_random_state(random_state)
+
+    y_vec, group_id_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)
+    gid = 0
+
+    # build target, group ID vectors.
+    relvalues = range(gmax + 1)
+
+    # build y/target and group-id vectors with user-specified group sizes.
+    if group is not None and hasattr(group, '__len__'):
+        n_samples = np.sum(group)
+
+        for i, gsize in enumerate(group):
+            y_vec = np.concatenate((y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True)))
+            group_id_vec = np.concatenate((group_id_vec, [i] * gsize))
+
+    # build y/target and group-id vectors according to n_samples, avg_gs, and random_gs.
+    else:
+        while len(y_vec) < n_samples:
+            gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)
+
+            # groups should contain > 1 element for pairwise learning objective.
+            if gsize < 1:
+                continue
+
+            y_vec = np.append(y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True))
+            group_id_vec = np.append(group_id_vec, [gid] * gsize)
+            gid += 1
+
+        y_vec, group_id_vec = y_vec[:n_samples], group_id_vec[:n_samples]
+
+    # build feature data, X. Transform first few into informative features.
+    n_informative = max(min(n_features, n_informative), 0)
+    X = rnd_generator.uniform(size=(n_samples, n_features))
+
+    for j in range(n_informative):
+        bias, coef = rnd_generator.normal(size=2)
+        X[:, j] = bias + coef * y_vec
+
+    return X, y_vec, group_id_vec
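A short usage sketch of the newly added helper, showing how an explicit group argument overrides n_samples/avg_gs/random_gs as the docstring describes (import path assumed, not part of the diff):

    import numpy as np
    from utils import make_ranking  # assuming tests/python_package_test is on sys.path

    X, y, g = make_ranking(group=[5, 10, 20], n_features=4, random_state=0)
    assert X.shape == (35, 4)                   # 5 + 10 + 20 documents
    assert list(np.bincount(g)) == [5, 10, 20]  # group ids 0, 1, 2 with given sizes
    assert y.max() <= 2                         # graded relevance in 0..gmax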