[GraphBolt] Add utils for sample (#5990)

Co-authored-by: Ubuntu <ubuntu@ip-172-31-16-19.ap-northeast-1.compute.internal> Co-authored-by: Hongzhi (Steve), Chen <chenhongzhi.nkcs@gmail.com>

[GraphBolt] Add utils for sample (#5990)
Co-authored-by: Ubuntu <ubuntu@ip-172-31-16-19.ap-northeast-1.compute.internal> Co-authored-by: Hongzhi (Steve), Chen <chenhongzhi.nkcs@gmail.com>
02e79a3d · peizhou001 · GitHub · 6519ec27 · 02e79a3d · 02e79a3d
Unverified Commit 02e79a3d authored Jul 14, 2023 by peizhou001 Committed by GitHub Jul 14, 2023
4 changed files
--- a/python/dgl/graphbolt/__init__.py
+++ b/python/dgl/graphbolt/__init__.py
@@ -15,6 +15,7 @@ from .impl import *
 from .dataloader import *
 from .subgraph_sampler import *
 from .sampled_subgraph import *
+from .utils import unique_and_compact_node_pairs


 def load_graphbolt():

--- a/python/dgl/graphbolt/utils/__init__.py
+++ b/python/dgl/graphbolt/utils/__init__.py
 """Utility functions for GraphBolt."""
 from .internal import *
+from .sample_utils import *
--- a/python/dgl/graphbolt/utils/sample_utils.py
+++ b/python/dgl/graphbolt/utils/sample_utils.py
+"""Utility functions for sampling."""
+
+from collections import defaultdict
+from typing import Dict, Tuple, Union
+
+import torch
+
+
+def unique_and_compact_node_pairs(
+    node_pairs: Union[
+        Tuple[torch.Tensor, torch.Tensor],
+        Dict[Tuple[str, str, str], Tuple[torch.Tensor, torch.Tensor]],
+    ]
+):
+    """
+    Compact node pairs and return unique nodes (per type).
+
+    Parameters
+    ----------
+    node_pairs : Tuple[torch.Tensor, torch.Tensor] or \
+        Dict(Tuple[str, str, str], Tuple[torch.Tensor, torch.Tensor])
+        Node pairs representing source-destination edges.
+        - If `node_pairs` is a tuple: It means the graph is homogeneous.
+        Also, it should be in the format ('u', 'v') representing source
+        and destination pairs. And IDs inside are homogeneous ids.
+        - If `node_pairs` is a dictionary: The keys should be edge types of
+        the form (source node type, edge name, destination node type) and
+        the values should be the corresponding node pairs. And IDs inside
+        are heterogeneous ids.
+
+    Returns
+    -------
+    Tuple[unique_nodes, compacted_node_pairs]
+        The unique nodes followed by the compacted node pairs, in that order.
+        - `unique_nodes`: for a dictionary input, a dictionary mapping each
+        node type to a tensor of its unique node IDs; for a tuple input, a
+        single tensor of unique node IDs. The tensors are whatever
+        `torch.unique` returns with its default arguments.
+        - `compacted_node_pairs`: has the same structure as the input
+        (dictionary keyed by edge type, or a single `(u, v)` tuple), with
+        every node ID replaced by its mapped node ID.
+        "Compacted node pairs" indicates that the node IDs in the input node
+        pairs are replaced with mapped node IDs, where each type of node is
+        mapped to a contiguous space of IDs ranging from 0 to N - 1, N being
+        the number of unique nodes of that type. A mapped ID is the position
+        of the original ID inside `unique_nodes` for that node type.
+
+    Examples
+    --------
+    >>> import dgl.graphbolt as gb
+    >>> N1 = torch.LongTensor([1, 2, 2])
+    >>> N2 = torch.LongTensor([5, 6, 5])
+    >>> node_pairs = {("n1", "e1", "n2"): (N1, N2),
+    ...     ("n2", "e2", "n1"): (N2, N1)}
+    >>> unique_nodes, compacted_node_pairs = gb.unique_and_compact_node_pairs(
+    ...     node_pairs
+    ... )
+    >>> print(unique_nodes)
+    {'n1': tensor([1, 2]), 'n2': tensor([5, 6])}
+    >>> print(compacted_node_pairs)
+    {('n1', 'e1', 'n2'): (tensor([0, 1, 1]), tensor([0, 1, 0])),
+    ('n2', 'e2', 'n1'): (tensor([0, 1, 0]), tensor([0, 1, 1]))}
+    """
+    # Anything that is not a dictionary is treated as a homogeneous (u, v)
+    # pair.
+    is_homogeneous = not isinstance(node_pairs, Dict)
+    if is_homogeneous:
+        # Wrap the pair under a placeholder edge type so that both input
+        # kinds share the dictionary-based code path below.
+        node_pairs = {("_N", "_E", "_N"): node_pairs}
+    nodes_dict = defaultdict(list)
+    # Collect nodes for each node type.
+    # The append order (edge types in dict order, source before destination)
+    # is relied upon by the map-back loop further down.
+    for etype, node_pair in node_pairs.items():
+        u_type, _, v_type = etype
+        u, v = node_pair
+        nodes_dict[u_type].append(u)
+        nodes_dict[v_type].append(v)
+
+    unique_nodes_dict = {}
+    inverse_indices_dict = {}
+    for ntype, nodes in nodes_dict.items():
+        # One flat tensor per node type, laid out in append order.
+        collected_nodes = torch.cat(nodes)
+        # Compact and find unique nodes.
+        # `inverse_indices[i]` is the position of `collected_nodes[i]` in
+        # `unique_nodes`, i.e. the compacted ID.
+        unique_nodes, inverse_indices = torch.unique(
+            collected_nodes,
+            return_inverse=True,
+        )
+        unique_nodes_dict[ntype] = unique_nodes
+        inverse_indices_dict[ntype] = inverse_indices
+
+    # Map back in same order as collect.
+    compacted_node_pairs = {}
+    # Rebind `unique_nodes` (so far the per-type loop variable) to the full
+    # per-type dictionary that is returned for heterogeneous input.
+    unique_nodes = unique_nodes_dict
+    for etype, node_pair in node_pairs.items():
+        u_type, _, v_type = etype
+        u, v = node_pair
+        u_size, v_size = u.numel(), v.numel()
+        # Take the leading `u_size` compacted IDs for this edge type's
+        # sources, then drop them so the next read for this node type starts
+        # at the right offset. Sources are consumed before destinations,
+        # which matters when `u_type == v_type`.
+        u = inverse_indices_dict[u_type][:u_size]
+        inverse_indices_dict[u_type] = inverse_indices_dict[u_type][u_size:]
+        v = inverse_indices_dict[v_type][:v_size]
+        inverse_indices_dict[v_type] = inverse_indices_dict[v_type][v_size:]
+        compacted_node_pairs[etype] = (u, v)
+
+    # Return singleton for homogeneous graph.
+    if is_homogeneous:
+        # Unwrap the single placeholder entry added at the top.
+        compacted_node_pairs = list(compacted_node_pairs.values())[0]
+        unique_nodes = list(unique_nodes_dict.values())[0]
+    return unique_nodes, compacted_node_pairs
--- a/tests/python/pytorch/graphbolt/test_graphbolt_utils.py
+++ b/tests/python/pytorch/graphbolt/test_graphbolt_utils.py
+import dgl.graphbolt as gb
+import torch
+
+
+def test_unique_and_compact_node_pairs_hetero():
+    """Check compaction of a dictionary of node pairs over three node types.
+
+    Random IDs are drawn for node types "n1", "n2" and "n3"; the expected
+    unique nodes and compacted IDs are computed directly with `torch.unique`
+    and compared against the output of `gb.unique_and_compact_node_pairs`.
+    """
+    # Random IDs in [0, 50); no seed is set, so inputs differ between runs.
+    N1 = torch.randint(0, 50, (30,))
+    N2 = torch.randint(0, 50, (20,))
+    N3 = torch.randint(0, 50, (10,))
+    # Reference result: unique values and the compacted ID of every element.
+    unique_N1, compacted_N1 = torch.unique(N1, return_inverse=True)
+    unique_N2, compacted_N2 = torch.unique(N2, return_inverse=True)
+    unique_N3, compacted_N3 = torch.unique(N3, return_inverse=True)
+    expected_unique_nodes = {
+        "n1": unique_N1,
+        "n2": unique_N2,
+        "n3": unique_N3,
+    }
+    # The slices below mirror the slices used to build `node_pairs`, so each
+    # expected tensor is the compacted form of the matching input tensor.
+    expected_compacted_pairs = {
+        ("n1", "e1", "n2"): (
+            compacted_N1[:20],
+            compacted_N2,
+        ),
+        ("n1", "e2", "n3"): (
+            compacted_N1[20:30],
+            compacted_N3,
+        ),
+        ("n2", "e3", "n3"): (
+            compacted_N2[10:],
+            compacted_N3,
+        ),
+    }
+    # "n1" is split across two edge types; "n2" and "n3" tensors are reused
+    # (fully or partially) by more than one edge type, which adds no new
+    # unique values for those types.
+    node_pairs = {
+        ("n1", "e1", "n2"): (
+            N1[:20],
+            N2,
+        ),
+        ("n1", "e2", "n3"): (
+            N1[20:30],
+            N3,
+        ),
+        ("n2", "e3", "n3"): (
+            N2[10:],
+            N3,
+        ),
+    }
+
+    unique_nodes, compacted_node_pairs = gb.unique_and_compact_node_pairs(
+        node_pairs
+    )
+    # NOTE(review): both loops iterate over the returned dictionaries, so a
+    # key missing from the result would not be detected here.
+    for etype, pair in compacted_node_pairs.items():
+        expected_u, expected_v = expected_compacted_pairs[etype]
+        u, v = pair
+        assert torch.equal(u, expected_u)
+        assert torch.equal(v, expected_v)
+    for ntype, nodes in unique_nodes.items():
+        expected_nodes = expected_unique_nodes[ntype]
+        assert torch.equal(nodes, expected_nodes)
+
+
+def test_unique_and_compact_node_pairs_homo():
+    """Check compaction of a single (u, v) tuple of node ID tensors.
+
+    Twenty random IDs are split into a source half and a destination half;
+    the expected result is `torch.unique` applied to all twenty IDs at once.
+    """
+    # Random IDs in [0, 50); no seed is set, so inputs differ between runs.
+    N = torch.randint(0, 50, (20,))
+    expected_unique_N, compacted_N = torch.unique(N, return_inverse=True)
+    # First 10 compacted IDs are the expected sources, last 10 the expected
+    # destinations.
+    expected_compacted_pairs = tuple(compacted_N.split(10))
+
+    # Same split on the raw IDs gives the (u, v) input tuple.
+    node_pairs = tuple(N.split(10))
+    unique_nodes, compacted_node_pairs = gb.unique_and_compact_node_pairs(
+        node_pairs
+    )
+    expected_u, expected_v = expected_compacted_pairs
+    # For tuple input the result is unpacked directly as a (u, v) tuple and
+    # a single tensor of unique nodes, not as dictionaries.
+    u, v = compacted_node_pairs
+    assert torch.equal(u, expected_u)
+    assert torch.equal(v, expected_v)
+    assert torch.equal(unique_nodes, expected_unique_N)