Unverified Commit 2288098b authored by Quan (Andy) Gan, committed by GitHub

[Feature] Converting a block to a graph and allow block construction (#2555)

* first commit

* more commit

* fix and docstring

* add to api reference

* address comments

* Update convert.py
parent 7ff2662b
...@@ -26,6 +26,8 @@ Operators for constructing :class:`DGLGraph` from raw data formats.
rand_bipartite
knn_graph
segmented_knn_graph
create_block
block_to_graph
.. _api-subgraph-extraction:
...
"""Module for converting graph from/to other object."""
from collections import defaultdict
from collections.abc import Mapping
from scipy.sparse import spmatrix
import numpy as np
import networkx as nx
from . import backend as F
from . import heterograph_index
from .heterograph import DGLHeteroGraph, combine_frames, DGLBlock
from . import graph_index
from . import utils
from .base import NTYPE, ETYPE, NID, EID, DGLError, dgl_warning
...@@ -17,6 +18,8 @@ __all__ = [
'hetero_from_relations',
'hetero_from_shared_memory',
'heterograph',
'create_block',
'block_to_graph',
'to_heterogeneous',
'to_hetero',
'to_homogeneous',
...@@ -321,7 +324,7 @@ def heterograph(data_dict,
if num_nodes_dict[dty] < vrange:
raise DGLError('The given number of nodes of node type {} must be larger than'
' the max ID in the data, but got {} and {}.'.format(
dty, num_nodes_dict[dty], vrange - 1))
# Create the graph
# Sort the ntypes and relation tuples to have a deterministic order for the same set
...@@ -354,6 +357,237 @@ def heterograph(data_dict,
return retg.to(device)
def create_block(data_dict, num_src_nodes=None, num_dst_nodes=None, idtype=None, device=None):
"""Create a :class:`DGLBlock` object.
Parameters
----------
data_dict : graph data
The dictionary data for constructing a block. The keys are in the form of
string triplets (src_type, edge_type, dst_type), specifying the input node type,
edge type, and output node type. The values are graph data in the form of
:math:`(U, V)`, where :math:`(U[i], V[i])` forms the edge with ID :math:`i`.
The allowed graph data formats are:
- ``(Tensor, Tensor)``: Each tensor must be a 1D tensor containing node IDs. DGL calls
this format "tuple of node-tensors". The tensors should have the same data type,
which must be either int32 or int64. They should also have the same device context
(see below the descriptions of :attr:`idtype` and :attr:`device`).
- ``(iterable[int], iterable[int])``: Similar to the tuple of node-tensors
format, but stores node IDs in two sequences (e.g. list, tuple, numpy.ndarray).
If you would like to create a block with a single input node type, a single output
node type, and a single edge type, then you can pass in the graph data directly
without wrapping it as a dictionary.
num_src_nodes : dict[str, int] or int, optional
The number of nodes for each input node type, which is a dictionary mapping a node type
:math:`T` to the number of :math:`T`-typed input nodes.
If not given for a node type :math:`T`, DGL finds the largest ID appearing across all the
graph data whose input node type is :math:`T`, and sets the number of nodes to
be that ID plus one. If given and the value is no greater than the largest ID for some
input node type, DGL will raise an error. By default, DGL infers the number of nodes for
all input node types.
If you would like to create a block with a single input node type, a single output
node type, and a single edge type, then you can pass in an integer to directly
represent the number of input nodes.
num_dst_nodes : dict[str, int] or int, optional
The number of nodes for each output node type, which is a dictionary mapping a node type
:math:`T` to the number of :math:`T`-typed output nodes.
If not given for a node type :math:`T`, DGL finds the largest ID appearing across all the
graph data whose output node type is :math:`T`, and sets the number of nodes to
be that ID plus one. If given and the value is no greater than the largest ID for some
output node type, DGL will raise an error. By default, DGL infers the number of nodes for
all output node types.
If you would like to create a block with a single input node type, a single output
node type, and a single edge type, then you can pass in an integer to directly
represent the number of output nodes.
idtype : int32 or int64, optional
The data type for storing the structure-related graph information such as node and
edge IDs. It should be a framework-specific data type object (e.g., ``torch.int32``).
If ``None`` (default), DGL infers the ID type from the :attr:`data_dict` argument.
device : device context, optional
The device of the returned graph, which should be a framework-specific device object
(e.g., ``torch.device``). If ``None`` (default), DGL uses the device of the tensors in
the :attr:`data_dict` argument. If :attr:`data_dict` does not contain node-tensors, the
returned graph is on CPU. If the specified :attr:`device` differs from that of the
provided tensors, it casts the given tensors to the specified device first.
Returns
-------
DGLBlock
The created block.
Notes
-----
1. If the :attr:`idtype` argument is not given then:
- in the case of the tuple of node-tensor format, DGL uses
the data type of the given ID tensors.
- in the case of the tuple of sequence format, DGL uses int64.
Once the graph has been created, you can change the data type by using
:func:`dgl.DGLGraph.long` or :func:`dgl.DGLGraph.int`.
If the specified :attr:`idtype` argument differs from the data type of the provided
tensors, it casts the given tensors to the specified data type first.
2. The most efficient construction approach is to provide a tuple of node tensors without
specifying :attr:`idtype` and :attr:`device`. This is because the returned graph shares
the storage with the input node-tensors in this case.
3. DGL internally maintains multiple copies of the graph structure in different sparse
formats and chooses the most efficient one depending on the computation invoked.
If memory usage becomes an issue in the case of large graphs, use
:func:`dgl.DGLGraph.formats` to restrict the allowed formats.
4. DGL internally decides a deterministic order for the same set of node types and canonical
edge types, which does not necessarily follow the order in :attr:`data_dict`.
Examples
--------
The following example uses PyTorch backend.
>>> import dgl
>>> block = dgl.create_block(([0, 1, 2], [1, 2, 3]), num_src_nodes=3, num_dst_nodes=4)
>>> block
Block(num_src_nodes=3, num_dst_nodes=4, num_edges=3)
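A tuple of node-tensors can be passed in directly instead of Python lists. The
following is an illustrative sketch assuming the PyTorch backend; in this case
the returned block shares storage with the input tensors (see the Notes above):
>>> import torch
>>> u, v = torch.tensor([0, 1, 2]), torch.tensor([1, 2, 3])
>>> block = dgl.create_block((u, v), num_src_nodes=3, num_dst_nodes=4)
>>> block.num_src_nodes(), block.num_dst_nodes(), block.num_edges()
(3, 4, 3)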
>>> block = dgl.create_block({
... ('A', 'AB', 'B'): ([1, 2, 3], [2, 1, 0]),
... ('B', 'BA', 'A'): ([2, 1], [2, 3])},
... num_src_nodes={'A': 6, 'B': 5},
... num_dst_nodes={'A': 4, 'B': 3})
>>> block
Block(num_src_nodes={'A': 6, 'B': 5},
num_dst_nodes={'A': 4, 'B': 3},
num_edges={('A', 'AB', 'B'): 3, ('B', 'BA', 'A'): 2},
metagraph=[('A', 'B', 'AB'), ('B', 'A', 'BA')])
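As mentioned in the Notes, the ID data type can be changed after construction.
An illustrative sketch assuming the PyTorch backend:
>>> block.idtype
torch.int64
>>> block = block.int()
>>> block.idtype
torch.int32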
See also
--------
to_block
"""
need_infer = num_src_nodes is None and num_dst_nodes is None
if not isinstance(data_dict, Mapping):
data_dict = {('_N', '_E', '_N'): data_dict}
if not need_infer:
assert isinstance(num_src_nodes, int), \
"num_src_nodes must be a pair of integers if data_dict is not a dict"
assert isinstance(num_dst_nodes, int), \
"num_dst_nodes must be a pair of integers if data_dict is not a dict"
num_src_nodes = {'_N': num_src_nodes}
num_dst_nodes = {'_N': num_dst_nodes}
else:
if not need_infer:
assert isinstance(num_src_nodes, Mapping), \
"num_src_nodes must be a dict if data_dict is a dict"
assert isinstance(num_dst_nodes, Mapping), \
"num_dst_nodes must be a dict if data_dict is a dict"
if need_infer:
num_src_nodes = defaultdict(int)
num_dst_nodes = defaultdict(int)
# Convert all data to node tensors first
node_tensor_dict = {}
for (sty, ety, dty), data in data_dict.items():
u, v, urange, vrange = utils.graphdata2tensors(data, idtype, bipartite=True)
node_tensor_dict[(sty, ety, dty)] = (u, v)
if need_infer:
num_src_nodes[sty] = max(num_src_nodes[sty], urange)
num_dst_nodes[dty] = max(num_dst_nodes[dty], vrange)
else: # sanity check
if num_src_nodes[sty] < urange:
raise DGLError('The given number of nodes of input node type {} must be larger'
' than the max ID in the data, but got {} and {}.'.format(
sty, num_src_nodes[sty], urange - 1))
if num_dst_nodes[dty] < vrange:
raise DGLError('The given number of nodes of output node type {} must be larger'
' than the max ID in the data, but got {} and {}.'.format(
dty, num_dst_nodes[dty], vrange - 1))
# Create the graph
# Sort the ntypes and relation tuples to have a deterministic order for the same set
# of type names.
srctypes = list(sorted(num_src_nodes.keys()))
dsttypes = list(sorted(num_dst_nodes.keys()))
relations = list(sorted(node_tensor_dict.keys()))
num_nodes_per_type = utils.toindex(
[num_src_nodes[ntype] for ntype in srctypes] +
[num_dst_nodes[ntype] for ntype in dsttypes], "int64")
srctype_dict = {ntype: i for i, ntype in enumerate(srctypes)}
dsttype_dict = {ntype: i + len(srctypes) for i, ntype in enumerate(dsttypes)}
meta_edges_src = []
meta_edges_dst = []
etypes = []
rel_graphs = []
for srctype, etype, dsttype in relations:
meta_edges_src.append(srctype_dict[srctype])
meta_edges_dst.append(dsttype_dict[dsttype])
etypes.append(etype)
src, dst = node_tensor_dict[(srctype, etype, dsttype)]
g = create_from_edges(src, dst, 'SRC/' + srctype, etype, 'DST/' + dsttype,
num_src_nodes[srctype], num_dst_nodes[dsttype])
rel_graphs.append(g)
# metagraph is DGLGraph, currently still using int64 as index dtype
metagraph = graph_index.from_coo(
len(srctypes) + len(dsttypes), meta_edges_src, meta_edges_dst, True)
# create graph index
hgidx = heterograph_index.create_heterograph_from_relations(
metagraph, [rgrh._graph for rgrh in rel_graphs], num_nodes_per_type)
retg = DGLBlock(hgidx, (srctypes, dsttypes), etypes)
return retg.to(device)
def block_to_graph(block):
"""Convert a :class:`DGLBlock` object to a :class:`DGLGraph`.
DGL will rename all the input node types by suffixing with ``_src``, and
all the output node types by suffixing with ``_dst``.
Features on the returned graph will be preserved.
Parameters
----------
block : DGLBlock
The block.
Returns
-------
DGLGraph
The graph.
Examples
--------
>>> block = dgl.create_block({
... ('A', 'AB', 'B'): ([1, 2, 3], [2, 1, 0]),
... ('B', 'BA', 'A'): ([2, 1], [2, 3])})
>>> g = dgl.block_to_graph(block)
>>> g
Graph(num_nodes={'A_src': 4, 'B_src': 3, 'A_dst': 4, 'B_dst': 3},
num_edges={('A_src', 'AB', 'B_dst'): 3, ('B_src', 'BA', 'A_dst'): 2},
metagraph=[('A_src', 'B_dst', 'AB'), ('B_src', 'A_dst', 'BA')])
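Features set on the block carry over to the converted graph. An illustrative
continuation of the example above, assuming the PyTorch backend:
>>> import torch
>>> block.srcnodes['A'].data['x'] = torch.randn(4, 5)
>>> dgl.block_to_graph(block).nodes['A_src'].data['x'].shape
torch.Size([4, 5])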
"""
new_types = [ntype + '_src' for ntype in block.srctypes] + \
[ntype + '_dst' for ntype in block.dsttypes]
retg = DGLHeteroGraph(block._graph, new_types, block.etypes)
for srctype in block.srctypes:
retg.nodes[srctype + '_src'].data.update(block.srcnodes[srctype].data)
for dsttype in block.dsttypes:
retg.nodes[dsttype + '_dst'].data.update(block.dstnodes[dsttype].data)
for srctype, etype, dsttype in block.canonical_etypes:
retg.edges[srctype + '_src', etype, dsttype + '_dst'].data.update(
block.edges[srctype, etype, dsttype].data)
return retg
def to_heterogeneous(G, ntypes, etypes, ntype_field=NTYPE,
etype_field=ETYPE, metagraph=None):
"""Convert a homogeneous graph to a heterogeneous graph and return.
...
...@@ -1591,10 +1591,10 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True):
``etype`` connecting from node ID ``u`` of type ``utype`` in the input side to node
ID ``v`` of type ``vtype`` in the output side.
For blocks returned by :func:`to_block`, the output nodes of the block will only
contain the nodes that have at least one inbound edge of any type. The input nodes
of the block will only contain the nodes that appear in the output nodes, as well
as the nodes that have at least one outbound edge connecting to one of the output nodes.
If the :attr:`dst_nodes` argument is not None, it specifies the output nodes instead.
...@@ -1637,6 +1637,8 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True):
:ref:`guide-minibatch` for a more thorough discussion about the methodology
of stochastic training.
See also :func:`create_block` for more flexible construction of blocks.
Examples
--------
Converting a homogeneous graph to a block as described above:
...@@ -1705,6 +1707,10 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True):
>>> block.srcnodes['A'].data[dgl.NID]
tensor([2, 1])
See also
--------
create_block
""" """
assert g.device == F.cpu(), 'the graph must be on CPU' assert g.device == F.cpu(), 'the graph must be on CPU'
......
...@@ -2503,6 +2503,117 @@ def test_frame_device(idtype): ...@@ -2503,6 +2503,117 @@ def test_frame_device(idtype):
assert F.context(ng._node_frames[0]._columns['hh'].storage) == F.ctx() assert F.context(ng._node_frames[0]._columns['hh'].storage) == F.ctx()
assert F.context(ng._edge_frames[0]._columns['h'].storage) == F.cpu() assert F.context(ng._edge_frames[0]._columns['h'].storage) == F.cpu()
@parametrize_dtype
def test_create_block(idtype):
block = dgl.create_block(([0, 1, 2], [1, 2, 3]), idtype=idtype, device=F.ctx())
assert block.num_src_nodes() == 3
assert block.num_dst_nodes() == 4
assert block.num_edges() == 3
block = dgl.create_block(([], []), idtype=idtype, device=F.ctx())
assert block.num_src_nodes() == 0
assert block.num_dst_nodes() == 0
assert block.num_edges() == 0
block = dgl.create_block(([], []), 3, 4, idtype=idtype, device=F.ctx())
assert block.num_src_nodes() == 3
assert block.num_dst_nodes() == 4
assert block.num_edges() == 0
block = dgl.create_block(([0, 1, 2], [1, 2, 3]), 4, 5, idtype=idtype, device=F.ctx())
assert block.num_src_nodes() == 4
assert block.num_dst_nodes() == 5
assert block.num_edges() == 3
sx = F.randn((4, 5))
dx = F.randn((5, 6))
ex = F.randn((3, 4))
block.srcdata['x'] = sx
block.dstdata['x'] = dx
block.edata['x'] = ex
g = dgl.block_to_graph(block)
assert g.num_src_nodes() == 4
assert g.num_dst_nodes() == 5
assert g.num_edges() == 3
assert g.srcdata['x'] is sx
assert g.dstdata['x'] is dx
assert g.edata['x'] is ex
block = dgl.create_block({
('A', 'AB', 'B'): ([1, 2, 3], [2, 1, 0]),
('B', 'BA', 'A'): ([2, 3], [3, 4])},
idtype=idtype, device=F.ctx())
assert block.num_src_nodes('A') == 4
assert block.num_src_nodes('B') == 4
assert block.num_dst_nodes('B') == 3
assert block.num_dst_nodes('A') == 5
assert block.num_edges('AB') == 3
assert block.num_edges('BA') == 2
block = dgl.create_block({
('A', 'AB', 'B'): ([], []),
('B', 'BA', 'A'): ([], [])},
idtype=idtype, device=F.ctx())
assert block.num_src_nodes('A') == 0
assert block.num_src_nodes('B') == 0
assert block.num_dst_nodes('B') == 0
assert block.num_dst_nodes('A') == 0
assert block.num_edges('AB') == 0
assert block.num_edges('BA') == 0
block = dgl.create_block({
('A', 'AB', 'B'): ([], []),
('B', 'BA', 'A'): ([], [])},
num_src_nodes={'A': 5, 'B': 5},
num_dst_nodes={'A': 6, 'B': 4},
idtype=idtype, device=F.ctx())
assert block.num_src_nodes('A') == 5
assert block.num_src_nodes('B') == 5
assert block.num_dst_nodes('B') == 4
assert block.num_dst_nodes('A') == 6
assert block.num_edges('AB') == 0
assert block.num_edges('BA') == 0
block = dgl.create_block({
('A', 'AB', 'B'): ([1, 2, 3], [2, 1, 0]),
('B', 'BA', 'A'): ([2, 3], [3, 4])},
num_src_nodes={'A': 5, 'B': 5},
num_dst_nodes={'A': 6, 'B': 4},
idtype=idtype, device=F.ctx())
assert block.num_src_nodes('A') == 5
assert block.num_src_nodes('B') == 5
assert block.num_dst_nodes('B') == 4
assert block.num_dst_nodes('A') == 6
assert block.num_edges(('A', 'AB', 'B')) == 3
assert block.num_edges(('B', 'BA', 'A')) == 2
sax = F.randn((5, 3))
sbx = F.randn((5, 4))
dax = F.randn((6, 5))
dbx = F.randn((4, 6))
eabx = F.randn((3, 7))
ebax = F.randn((2, 8))
block.srcnodes['A'].data['x'] = sax
block.srcnodes['B'].data['x'] = sbx
block.dstnodes['A'].data['x'] = dax
block.dstnodes['B'].data['x'] = dbx
block.edges['AB'].data['x'] = eabx
block.edges['BA'].data['x'] = ebax
hg = dgl.block_to_graph(block)
assert hg.num_nodes('A_src') == 5
assert hg.num_nodes('B_src') == 5
assert hg.num_nodes('A_dst') == 6
assert hg.num_nodes('B_dst') == 4
assert hg.num_edges(('A_src', 'AB', 'B_dst')) == 3
assert hg.num_edges(('B_src', 'BA', 'A_dst')) == 2
assert hg.nodes['A_src'].data['x'] is sax
assert hg.nodes['B_src'].data['x'] is sbx
assert hg.nodes['A_dst'].data['x'] is dax
assert hg.nodes['B_dst'].data['x'] is dbx
assert hg.edges['AB'].data['x'] is eabx
assert hg.edges['BA'].data['x'] is ebax
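# Not part of the original test, an illustrative sketch: per the create_block
# docstring, giving a number of source nodes no greater than the max source ID
# in the data should raise DGLError (max source ID below is 2, so 2 is too few).
raised = False
try:
    dgl.create_block(([0, 1, 2], [1, 2, 3]), 2, 4, idtype=idtype, device=F.ctx())
except dgl.DGLError:
    raised = True
assert raised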
if __name__ == '__main__':
...@@ -2541,4 +2652,5 @@ if __name__ == '__main__':
#test_frame(F.int32)
#test_frame_device(F.int32)
#test_empty_query(F.int32)
#test_create_block(F.int32)
pass