Unverified Commit dc78e11c authored by Jinjing Zhou, committed by GitHub

[Dataset] Adapter to convert a dataset for link prediction task (#3699)



* add ut

* add doc link

* install dep

* add

* Revert "install dep"

This reverts commit e574a8377144749056c6849b655004df2771e179.

* add

* merge fix

* rm files

* fix

* fix

* fix

* fix

* fix typo

* fix tf

* fix

* fix

* fix

* fix

* fix

* fix dependency

* fix test

* fix

* fix

* add doc

* fix

* fix

* fix test

* fix test
Co-authored-by: default avatarMinjie Wang <wmjlyjemaine@gmail.com>
parent bc8f8b0b
......@@ -234,6 +234,10 @@ Dataset adapters
:members: __getitem__, __len__
.. autoclass:: AsLinkPredDataset
:members: __getitem__, __len__
Utilities
-----------------
......
......@@ -30,7 +30,7 @@ from .rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from .fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
from .fakenews import FakeNewsDataset
from .csv_dataset import DGLCSVDataset
from .adapter import AsNodePredDataset
from .adapter import AsNodePredDataset, AsLinkPredDataset
def register_data_args(parser):
parser.add_argument(
......
......@@ -2,12 +2,17 @@
import os
import json
import numpy as np
from .. import backend as F
from ..convert import graph as create_dgl_graph
from ..sampling.negative import _calc_redundancy
from .dgl_dataset import DGLDataset
from . import utils
__all__ = ['AsNodePredDataset']
__all__ = ['AsNodePredDataset', 'AsLinkPredDataset']
class AsNodePredDataset(DGLDataset):
"""Repurpose a dataset for a standard semi-supervised transductive
......@@ -61,32 +66,43 @@ class AsNodePredDataset(DGLDataset):
>>> print('train_mask' in new_ds[0].ndata)
True
"""
def __init__(self,
dataset,
split_ratio=[0.8, 0.1, 0.1],
split_ratio=None,
target_ntype=None,
**kwargs):
self.g = dataset[0].clone()
self.split_ratio = split_ratio
self.target_ntype = target_ntype
self.num_classes = getattr(dataset, 'num_classes', None)
super().__init__(dataset.name + '-as-nodepred', **kwargs)
super().__init__(dataset.name + '-as-nodepred',
hash_key=(split_ratio, target_ntype), **kwargs)
def process(self):
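        # Two modes: with split_ratio=None, reuse the train/val/test masks
        # shipped with the dataset; otherwise generate fresh masks from the ratio.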
if 'label' not in self.g.nodes[self.target_ntype].data:
raise ValueError("Missing node labels. Make sure labels are stored "
"under name 'label'.")
if self.split_ratio is None:
assert "train_mask" in self.g.nodes[self.target_ntype].data, \
"train_mask is not provided, please specify split_ratio to generate the masks"
assert "val_mask" in self.g.nodes[self.target_ntype].data, \
"val_mask is not provided, please specify split_ratio to generate the masks"
assert "test_mask" in self.g.nodes[self.target_ntype].data, \
"test_mask is not provided, please specify split_ratio to generate the masks"
else:
if self.verbose:
print('Generating train/val/test masks...')
utils.add_nodepred_split(self, self.split_ratio, self.target_ntype)
if self.num_classes is None:
self.num_classes = len(F.unique(self.g.nodes[self.target_ntype].data['label']))
def has_cache(self):
return os.path.isfile(os.path.join(self.save_path, 'graph.bin'))
return os.path.isfile(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
def load(self):
with open(os.path.join(self.save_path, 'info.json'), 'r') as f:
with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'r') as f:
info = json.load(f)
if (info['split_ratio'] != self.split_ratio
or info['target_ntype'] != self.target_ntype):
......@@ -95,16 +111,200 @@ class AsNodePredDataset(DGLDataset):
self.split_ratio = info['split_ratio']
self.target_ntype = info['target_ntype']
self.num_classes = info['num_classes']
gs, _ = utils.load_graphs(os.path.join(self.save_path, 'graph.bin'))
gs, _ = utils.load_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
self.g = gs[0]
def save(self):
utils.save_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)), [self.g])
with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'w') as f:
json.dump({
'split_ratio': self.split_ratio,
'target_ntype': self.target_ntype,
'num_classes': self.num_classes}, f)
def __getitem__(self, idx):
return self.g
def __len__(self):
return 1
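A minimal usage sketch of the two split modes handled in process() above, assuming a dataset such as CoraGraphDataset that already ships train/val/test masks:

    import dgl.data as data

    # Mode 1: generate masks from an explicit split ratio.
    ds = data.AsNodePredDataset(data.CoraGraphDataset(), split_ratio=[0.8, 0.1, 0.1])

    # Mode 2: split_ratio=None reuses the masks shipped with the dataset.
    ds = data.AsNodePredDataset(data.CoraGraphDataset())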
def negative_sample(g, num_samples):
    """Randomly sample negative edges from the graph, excluding self-loops
    and existing edges. The number of returned samples may be smaller
    than num_samples.
    """
num_nodes = g.num_nodes()
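    # Oversample by a redundancy factor so that enough candidates survive
    # de-duplication and the removal of self-loops and existing edges.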
redundancy = _calc_redundancy(
num_samples, g.num_edges(), num_nodes ** 2)
sample_size = int(num_samples*(1+redundancy))
edges = np.random.randint(0, num_nodes, size=(2, sample_size))
edges = np.unique(edges, axis=1)
# remove self loop
mask_self_loop = edges[0] == edges[1]
# remove existing edges
has_edges = F.asnumpy(g.has_edges_between(edges[0], edges[1]))
mask = ~(np.logical_or(mask_self_loop, has_edges))
edges = edges[:, mask]
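    # Truncate to the requested count; fewer edges may remain if the
    # filtering above removed too many candidates.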
if edges.shape[1] >= num_samples:
edges = edges[:, :num_samples]
return edges
class AsLinkPredDataset(DGLDataset):
    """Repurpose a dataset for the link prediction task.

    The created dataset contains the data needed for link prediction.
    Currently only homogeneous graphs are supported.

    It keeps only the first graph of the provided dataset and generates
    train/val/test edges according to the given split ratio, together with
    the corresponding negative edges based on neg_ratio. The generated edges
    are cached to disk for fast re-loading. If the provided split ratio
    differs from the cached one, the dataset is re-processed.
Parameters
----------
dataset : DGLDataset
The dataset to be converted.
    split_ratio : (float, float, float), optional
        Split ratios for the training, validation and test sets. Must sum to one.
    neg_ratio : int, optional
        How many negative samples to draw per positive edge. The number of
        negative samples will be neg_ratio * num_positive_edges.
    Attributes
    ----------
    feat_size: int
        The size of the node feature dimension of the graph
train_graph: DGLGraph
The DGLGraph for training
val_edges: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]
The validation set edges, encoded as
((positive_edge_src, positive_edge_dst), (negative_edge_src, negative_edge_dst))
test_edges: Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]
The test set edges, encoded as
((positive_edge_src, positive_edge_dst), (negative_edge_src, negative_edge_dst))
Examples
--------
>>> ds = dgl.data.CoraGraphDataset()
>>> print(ds)
Dataset("cora_v2", num_graphs=1, save_path=...)
>>> new_ds = dgl.data.AsLinkPredDataset(ds, [0.8, 0.1, 0.1])
>>> print(new_ds)
Dataset("cora_v2-as-edgepred", num_graphs=1, save_path=/home/ubuntu/.dgl/cora_v2-as-edgepred)
>>> print(hasattr(new_ds, "test_edges"))
True
"""
def __init__(self,
dataset,
split_ratio=None,
neg_ratio=3,
**kwargs):
self.g = dataset[0]
self.num_nodes = self.g.num_nodes()
self.dataset = dataset
self.split_ratio = split_ratio
self.neg_ratio = neg_ratio
super().__init__(dataset.name + '-as-edgepred',
hash_key=(neg_ratio, split_ratio), **kwargs)
def process(self):
if self.split_ratio is None:
            assert hasattr(self.dataset, "get_edge_split"), \
                "The dataset has no get_edge_split method; please specify split_ratio and neg_ratio to generate the split."
# This is likely to be an ogb dataset
self.edge_split = self.dataset.get_edge_split()
self._train_graph = self.g
pos_e_tensor, neg_e_tensor = self.edge_split["valid"][
"edge"], self.edge_split["valid"]["edge_neg"]
pos_e = (pos_e_tensor[:, 0], pos_e_tensor[:, 1])
neg_e = (neg_e_tensor[:, 0], neg_e_tensor[:, 1])
self._val_edges = pos_e, neg_e
pos_e_tensor, neg_e_tensor = self.edge_split["test"][
"edge"], self.edge_split["test"]["edge_neg"]
pos_e = (pos_e_tensor[:, 0], pos_e_tensor[:, 1])
neg_e = (neg_e_tensor[:, 0], neg_e_tensor[:, 1])
self._test_edges = pos_e, neg_e
else:
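            # No pre-defined split: randomly partition the existing edges into
            # train/val/test positives and sample negatives for val/test only.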
ratio = self.split_ratio
graph = self.dataset[0]
n = graph.num_edges()
src, dst = graph.edges()
src, dst = F.asnumpy(src), F.asnumpy(dst)
n_train, n_val, n_test = int(
n * ratio[0]), int(n * ratio[1]), int(n * ratio[2])
idx = np.random.permutation(n)
train_pos_idx = idx[:n_train]
val_pos_idx = idx[n_train:n_train+n_val]
test_pos_idx = idx[n_train+n_val:]
neg_src, neg_dst = negative_sample(
graph, self.neg_ratio*(n_val+n_test))
neg_n_val, neg_n_test = self.neg_ratio * n_val, self.neg_ratio * n_test
neg_val_src, neg_val_dst = neg_src[:neg_n_val], neg_dst[:neg_n_val]
neg_test_src, neg_test_dst = neg_src[neg_n_val:], neg_dst[neg_n_val:]
self._val_edges = (F.tensor(src[val_pos_idx]), F.tensor(dst[val_pos_idx])
), (F.tensor(neg_val_src), F.tensor(neg_val_dst))
self._test_edges = (F.tensor(src[test_pos_idx]),
F.tensor(dst[test_pos_idx])), (F.tensor(neg_test_src), F.tensor(neg_test_dst))
self._train_graph = create_dgl_graph(
(src[train_pos_idx], dst[train_pos_idx]), num_nodes=self.num_nodes)
self._train_graph.ndata["feat"] = graph.ndata["feat"]
def has_cache(self):
return os.path.isfile(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
def load(self):
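        # Restore the cached training graph and the positive/negative
        # edge tensors stored under this hash key.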
gs, tensor_dict = utils.load_graphs(
os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
self.g = gs[0]
self._train_graph = self.g
self._val_edges = (tensor_dict["val_pos_src"], tensor_dict["val_pos_dst"]), (
tensor_dict["val_neg_src"], tensor_dict["val_neg_dst"])
self._test_edges = (tensor_dict["test_pos_src"], tensor_dict["test_pos_dst"]), (
tensor_dict["test_neg_src"], tensor_dict["test_neg_dst"])
with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'r') as f:
info = json.load(f)
self.split_ratio = info["split_ratio"]
self.neg_ratio = info["neg_ratio"]
    def save(self):
        tensor_dict = {
            "val_pos_src": self._val_edges[0][0],
            "val_pos_dst": self._val_edges[0][1],
            "val_neg_src": self._val_edges[1][0],
            "val_neg_dst": self._val_edges[1][1],
            "test_pos_src": self._test_edges[0][0],
            "test_pos_dst": self._test_edges[0][1],
            "test_neg_src": self._test_edges[1][0],
            "test_neg_dst": self._test_edges[1][1],
        }
        utils.save_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)),
                          [self._train_graph], tensor_dict)
        with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'w') as f:
            json.dump({
                'split_ratio': self.split_ratio,
                'neg_ratio': self.neg_ratio}, f)
@property
def feat_size(self):
return self._train_graph.ndata["feat"].shape[-1]
@property
def train_graph(self):
return self._train_graph
@property
def val_edges(self):
return self._val_edges
@property
def test_edges(self):
return self._test_edges
def __getitem__(self, idx):
return self.g
......
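For context, a minimal usage sketch of the new adapter (names as defined in adapter.py above; the GNN encoder and training loop are omitted):

    import dgl.data as data

    ds = data.AsLinkPredDataset(data.CoraGraphDataset(),
                                split_ratio=[0.8, 0.1, 0.1], neg_ratio=3)
    train_g = ds.train_graph            # graph with training-positive edges only
    val_pos, val_neg = ds.val_edges     # ((src, dst), (src, dst)) tensor pairs
    test_pos, test_neg = ds.test_edges
    in_feats = ds.feat_size             # node feature dimension for the encoder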
......@@ -9,7 +9,7 @@ import yaml
import pytest
import dgl.data as data
from dgl import DGLError
import dgl
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_minigc():
......@@ -1067,6 +1067,33 @@ def test_as_nodepred2():
ds = data.AsNodePredDataset(data.AIFBDataset(), [0.1, 0.1, 0.8], 'Personen', verbose=True)
assert F.sum(F.astype(ds[0].nodes['Personen'].data['train_mask'], F.int32), 0) == int(ds[0].num_nodes('Personen') * 0.1)
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_as_linkpred():
# create
ds = data.AsLinkPredDataset(data.CoraGraphDataset(), split_ratio=[0.8, 0.1, 0.1], neg_ratio=1, verbose=True)
    # Cora has 10556 edges; the test split takes the remainder: 10556 - 8444 - 1055 = 1057 edges
    assert ds.test_edges[0][0].shape[0] == 1057
    # negative sampling may return fewer edges than requested, so the assert uses a relaxed range
    assert 1000 <= ds.test_edges[1][0].shape[0] <= 1057
    # different split_ratio and neg_ratio hash to a new cache entry, so a new split is generated
    ds = data.AsLinkPredDataset(data.CoraGraphDataset(), split_ratio=[0.7, 0.1, 0.2], neg_ratio=2, verbose=True)
    assert ds.test_edges[0][0].shape[0] == 2112
    # negative sampling is not guaranteed to hit the exact ratio of 2, so the assert uses a relaxed range
    assert 4000 < ds.test_edges[1][0].shape[0] <= 4224
@unittest.skipIf(dgl.backend.backend_name != 'pytorch', reason="ogb only supports pytorch")
def test_as_linkpred_ogb():
from ogb.linkproppred import DglLinkPropPredDataset
ds = data.AsLinkPredDataset(DglLinkPropPredDataset("ogbl-collab"), split_ratio=None, verbose=True)
# original dataset has 46329 test edges
assert ds.test_edges[0][0].shape[0] == 46329
# force generate new split
ds = data.AsLinkPredDataset(DglLinkPropPredDataset("ogbl-collab"), split_ratio=[0.7, 0.2, 0.1], verbose=True)
assert ds.test_edges[0][0].shape[0] == 235812
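For reference, the edge-split layout the adapter consumes in process() (key names exactly as used in the diff; shapes follow the ogb convention):

    from ogb.linkproppred import DglLinkPropPredDataset

    split = DglLinkPropPredDataset("ogbl-collab").get_edge_split()
    # split["valid"]["edge"]      positive validation edges, shape (N, 2)
    # split["valid"]["edge_neg"]  negative validation edges, shape (M, 2)
    # split["test"]["edge"] and split["test"]["edge_neg"] follow the same layout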
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_as_nodepred_csvdataset():
with tempfile.TemporaryDirectory() as test_dir:
......@@ -1103,7 +1130,7 @@ def test_as_nodepred_csvdataset():
assert 'label' in ds[0].ndata
assert 'train_mask' not in ds[0].ndata
assert not hasattr(ds[0], 'num_classes')
new_ds = data.AsNodePredDataset(ds, force_reload=True)
new_ds = data.AsNodePredDataset(ds, split_ratio=[0.8, 0.1, 0.1], force_reload=True)
assert new_ds.num_classes == num_classes
assert 'feat' in new_ds[0].ndata
assert 'label' in new_ds[0].ndata
......
......@@ -32,7 +32,7 @@ fi
conda activate ${DGLBACKEND}-ci
python3 -m pip install pytest pyyaml pandas pydantic rdflib || EXIT /B 1
python3 -m pip install pytest pyyaml pandas pydantic rdflib ogb || fail "pip install"
python3 -m pytest -v --junitxml=pytest_compute.xml tests/compute || fail "compute"
python3 -m pytest -v --junitxml=pytest_backend.xml tests/$DGLBACKEND || fail "backend-specific"
......