Unverified Commit 2288098b authored by Quan (Andy) Gan, committed by GitHub

[Feature] Converting a block to a graph and allow block construction (#2555)

* first commit

* more commit

* fix and docstring

* add to api reference

* address comments

* Update convert.py
parent 7ff2662b
...@@ -26,6 +26,8 @@ Operators for constructing :class:`DGLGraph` from raw data formats.
rand_bipartite
knn_graph
segmented_knn_graph
create_block
block_to_graph
.. _api-subgraph-extraction:
...
"""Module for converting graph from/to other object."""
from collections import defaultdict
from collections.abc import Mapping
from scipy.sparse import spmatrix
import numpy as np
import networkx as nx
from . import backend as F
from . import heterograph_index
from .heterograph import DGLHeteroGraph, combine_frames, DGLBlock
from . import graph_index
from . import utils
from .base import NTYPE, ETYPE, NID, EID, DGLError, dgl_warning
...@@ -17,6 +18,8 @@ __all__ = [
'hetero_from_relations',
'hetero_from_shared_memory',
'heterograph',
'create_block',
'block_to_graph',
'to_heterogeneous',
'to_hetero',
'to_homogeneous',
...@@ -321,7 +324,7 @@ def heterograph(data_dict,
if num_nodes_dict[dty] < vrange:
raise DGLError('The given number of nodes of node type {} must be larger than'
' the max ID in the data, but got {} and {}.'.format(
dty, num_nodes_dict[dty], vrange - 1))
# Create the graph
# Sort the ntypes and relation tuples to have a deterministic order for the same set
...@@ -354,6 +357,237 @@ def heterograph(data_dict,
return retg.to(device)
def create_block(data_dict, num_src_nodes=None, num_dst_nodes=None, idtype=None, device=None):
"""Create a :class:`DGLBlock` object.
Parameters
----------
data_dict : graph data
The dictionary data for constructing a block. The keys are in the form of
string triplets (src_type, edge_type, dst_type), specifying the input node type,
edge type, and output node type. The values are graph data in the form of
:math:`(U, V)`, where :math:`(U[i], V[i])` forms the edge with ID :math:`i`.
The allowed graph data formats are:
- ``(Tensor, Tensor)``: Each tensor must be a 1D tensor containing node IDs. DGL calls
this format "tuple of node-tensors". The tensors should have the same data type,
which must be either int32 or int64. They should also have the same device context
(see below the descriptions of :attr:`idtype` and :attr:`device`).
- ``(iterable[int], iterable[int])``: Similar to the tuple of node-tensors
format, but stores node IDs in two sequences (e.g. list, tuple, numpy.ndarray).
If you would like to create a block with a single input node type, a single output
node type, and a single edge type, then you can pass in the graph data directly
without wrapping it as a dictionary.
num_src_nodes : dict[str, int] or int, optional
The number of nodes for each input node type, which is a dictionary mapping a node type
:math:`T` to the number of :math:`T`-typed input nodes.
If not given for a node type :math:`T`, DGL finds the largest ID appearing across all the
graph data whose input node type is :math:`T`, and sets the number of nodes to
be that ID plus one. If given and the value is no greater than the largest ID for some
input node type, DGL will raise an error. By default, DGL infers the number of nodes for
all input node types.
If you would like to create a block with a single input node type, a single output
node type, and a single edge type, then you can pass in an integer to directly
represent the number of input nodes.
num_dst_nodes : dict[str, int] or int, optional
The number of nodes for each output node type, which is a dictionary mapping a node type
:math:`T` to the number of :math:`T`-typed output nodes.
If not given for a node type :math:`T`, DGL finds the largest ID appearing across all the
graph data whose output node type is :math:`T`, and sets the number of nodes to
be that ID plus one. If given and the value is no greater than the largest ID for some
output node type, DGL will raise an error. By default, DGL infers the number of nodes for
all output node types.
If you would like to create a block with a single input node type, a single output
node type, and a single edge type, then you can pass in an integer to directly
represent the number of output nodes.
idtype : int32 or int64, optional
The data type for storing the structure-related graph information such as node and
edge IDs. It should be a framework-specific data type object (e.g., ``torch.int32``).
If ``None`` (default), DGL infers the ID type from the :attr:`data_dict` argument.
device : device context, optional
The device of the returned graph, which should be a framework-specific device object
(e.g., ``torch.device``). If ``None`` (default), DGL uses the device of the tensors in
the :attr:`data_dict` argument. If :attr:`data_dict` does not contain node-tensors, the
returned graph is on CPU. If the specified :attr:`device` differs from that of the
provided tensors, it casts the given tensors to the specified device first.
Returns
-------
DGLBlock
The created block.
Notes
-----
1. If the :attr:`idtype` argument is not given then:
- in the case of the tuple of node-tensor format, DGL uses
the data type of the given ID tensors.
- in the case of the tuple of sequence format, DGL uses int64.
Once the graph has been created, you can change the data type by using
:func:`dgl.DGLGraph.long` or :func:`dgl.DGLGraph.int`.
If the specified :attr:`idtype` argument differs from the data type of the provided
tensors, it casts the given tensors to the specified data type first.
2. The most efficient construction approach is to provide a tuple of node tensors without
specifying :attr:`idtype` and :attr:`device`. This is because the returned graph shares
the storage with the input node-tensors in this case.
3. DGL internally maintains multiple copies of the graph structure in different sparse
formats and chooses the most efficient one depending on the computation invoked.
If memory usage becomes an issue in the case of large graphs, use
:func:`dgl.DGLGraph.formats` to restrict the allowed formats.
4. DGL internally decides a deterministic order for the same set of node types and canonical
edge types, which does not necessarily follow the order in :attr:`data_dict`.
Examples
--------
The following example uses PyTorch backend.
>>> import dgl
>>> block = dgl.create_block(([0, 1, 2], [1, 2, 3]), num_src_nodes=3, num_dst_nodes=4)
>>> block
Block(num_src_nodes=3, num_dst_nodes=4, num_edges=3)
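A tuple of node-tensors can be passed in directly instead of Python lists. The
following is an illustrative sketch assuming the PyTorch backend; in this case
the returned block shares storage with the input tensors (see the Notes above):
>>> import torch
>>> u, v = torch.tensor([0, 1, 2]), torch.tensor([1, 2, 3])
>>> block = dgl.create_block((u, v), num_src_nodes=3, num_dst_nodes=4)
>>> block.num_src_nodes(), block.num_dst_nodes(), block.num_edges()
(3, 4, 3)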
>>> block = dgl.create_block({
... ('A', 'AB', 'B'): ([1, 2, 3], [2, 1, 0]),
... ('B', 'BA', 'A'): ([2, 1], [2, 3])},
... num_src_nodes={'A': 6, 'B': 5},
... num_dst_nodes={'A': 4, 'B': 3})
>>> block
Block(num_src_nodes={'A': 6, 'B': 5},
num_dst_nodes={'A': 4, 'B': 3},
num_edges={('A', 'AB', 'B'): 3, ('B', 'BA', 'A'): 2},
metagraph=[('A', 'B', 'AB'), ('B', 'A', 'BA')])
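As mentioned in the Notes, the ID data type can be changed after construction.
An illustrative sketch assuming the PyTorch backend:
>>> block.idtype
torch.int64
>>> block = block.int()
>>> block.idtype
torch.int32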
See also
--------
to_block
"""
need_infer = num_src_nodes is None and num_dst_nodes is None
if not isinstance(data_dict, Mapping):
data_dict = {('_N', '_E', '_N'): data_dict}
if not need_infer:
assert isinstance(num_src_nodes, int), \
"num_src_nodes must be a pair of integers if data_dict is not a dict"
assert isinstance(num_dst_nodes, int), \
"num_dst_nodes must be a pair of integers if data_dict is not a dict"
num_src_nodes = {'_N': num_src_nodes}
num_dst_nodes = {'_N': num_dst_nodes}
else:
if not need_infer:
assert isinstance(num_src_nodes, Mapping), \
"num_src_nodes must be a dict if data_dict is a dict"
assert isinstance(num_dst_nodes, Mapping), \
"num_dst_nodes must be a dict if data_dict is a dict"
if need_infer:
num_src_nodes = defaultdict(int)
num_dst_nodes = defaultdict(int)
# Convert all data to node tensors first
node_tensor_dict = {}
for (sty, ety, dty), data in data_dict.items():
u, v, urange, vrange = utils.graphdata2tensors(data, idtype, bipartite=True)
node_tensor_dict[(sty, ety, dty)] = (u, v)
if need_infer:
num_src_nodes[sty] = max(num_src_nodes[sty], urange)
num_dst_nodes[dty] = max(num_dst_nodes[dty], vrange)
else: # sanity check
if num_src_nodes[sty] < urange:
raise DGLError('The given number of nodes of input node type {} must be larger'
' than the max ID in the data, but got {} and {}.'.format(
sty, num_src_nodes[sty], urange - 1))
if num_dst_nodes[dty] < vrange:
raise DGLError('The given number of nodes of output node type {} must be larger'
' than the max ID in the data, but got {} and {}.'.format(
dty, num_dst_nodes[dty], vrange - 1))
# Create the graph
# Sort the ntypes and relation tuples to have a deterministic order for the same set
# of type names.
srctypes = list(sorted(num_src_nodes.keys()))
dsttypes = list(sorted(num_dst_nodes.keys()))
relations = list(sorted(node_tensor_dict.keys()))
num_nodes_per_type = utils.toindex(
[num_src_nodes[ntype] for ntype in srctypes] +
[num_dst_nodes[ntype] for ntype in dsttypes], "int64")
srctype_dict = {ntype: i for i, ntype in enumerate(srctypes)}
dsttype_dict = {ntype: i + len(srctypes) for i, ntype in enumerate(dsttypes)}
meta_edges_src = []
meta_edges_dst = []
etypes = []
rel_graphs = []
for srctype, etype, dsttype in relations:
meta_edges_src.append(srctype_dict[srctype])
meta_edges_dst.append(dsttype_dict[dsttype])
etypes.append(etype)
src, dst = node_tensor_dict[(srctype, etype, dsttype)]
g = create_from_edges(src, dst, 'SRC/' + srctype, etype, 'DST/' + dsttype,
num_src_nodes[srctype], num_dst_nodes[dsttype])
rel_graphs.append(g)
# metagraph is DGLGraph, currently still using int64 as index dtype
metagraph = graph_index.from_coo(
len(srctypes) + len(dsttypes), meta_edges_src, meta_edges_dst, True)
# create graph index
hgidx = heterograph_index.create_heterograph_from_relations(
metagraph, [rgrh._graph for rgrh in rel_graphs], num_nodes_per_type)
retg = DGLBlock(hgidx, (srctypes, dsttypes), etypes)
return retg.to(device)
def block_to_graph(block):
"""Convert a :class:`DGLBlock` object to a :class:`DGLGraph`.
DGL will rename all the input node types by suffixing with ``_src``, and
all the output node types by suffixing with ``_dst``.
Features on the returned graph will be preserved.
Parameters
----------
block : DGLBlock
The block.
Returns
-------
DGLGraph
The graph.
Examples
--------
>>> block = dgl.create_block({
... ('A', 'AB', 'B'): ([1, 2, 3], [2, 1, 0]),
... ('B', 'BA', 'A'): ([2, 1], [2, 3])})
>>> g = dgl.block_to_graph(block)
>>> g
Graph(num_nodes={'A_src': 4, 'B_src': 3, 'A_dst': 4, 'B_dst': 3},
num_edges={('A_src', 'AB', 'B_dst'): 3, ('B_src', 'BA', 'A_dst'): 2},
metagraph=[('A_src', 'B_dst', 'AB'), ('B_src', 'A_dst', 'BA')])
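Features set on the block carry over to the converted graph. An illustrative
continuation of the example above, assuming the PyTorch backend:
>>> import torch
>>> block.srcnodes['A'].data['x'] = torch.randn(4, 5)
>>> dgl.block_to_graph(block).nodes['A_src'].data['x'].shape
torch.Size([4, 5])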
"""
new_types = [ntype + '_src' for ntype in block.srctypes] + \
[ntype + '_dst' for ntype in block.dsttypes]
retg = DGLHeteroGraph(block._graph, new_types, block.etypes)
for srctype in block.srctypes:
retg.nodes[srctype + '_src'].data.update(block.srcnodes[srctype].data)
for dsttype in block.dsttypes:
retg.nodes[dsttype + '_dst'].data.update(block.dstnodes[dsttype].data)
for srctype, etype, dsttype in block.canonical_etypes:
retg.edges[srctype + '_src', etype, dsttype + '_dst'].data.update(
block.edges[srctype, etype, dsttype].data)
return retg
def to_heterogeneous(G, ntypes, etypes, ntype_field=NTYPE,
etype_field=ETYPE, metagraph=None):
"""Convert a homogeneous graph to a heterogeneous graph and return.
...
...@@ -1591,10 +1591,10 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True):
``etype`` connecting from node ID ``u`` of type ``utype`` in the input side to node
ID ``v`` of type ``vtype`` in the output side.
For blocks returned by :func:`to_block`, the output nodes of the block will only
contain the nodes that have at least one inbound edge of any type. The input nodes
of the block will only contain the nodes that appear in the output nodes, as well
as the nodes that have at least one outbound edge connecting to one of the output nodes.
If the :attr:`dst_nodes` argument is not None, it specifies the output nodes instead.
...@@ -1637,6 +1637,8 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True):
:ref:`guide-minibatch` for a more thorough discussion about the methodology
of stochastic training.
See also :func:`create_block` for more flexible construction of blocks.
Examples
--------
Converting a homogeneous graph to a block as described above:
...@@ -1705,6 +1707,10 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True):
>>> block.srcnodes['A'].data[dgl.NID]
tensor([2, 1])
See also
--------
create_block
""" """
assert g.device == F.cpu(), 'the graph must be on CPU' assert g.device == F.cpu(), 'the graph must be on CPU'
......
...@@ -2503,6 +2503,117 @@ def test_frame_device(idtype): ...@@ -2503,6 +2503,117 @@ def test_frame_device(idtype):
assert F.context(ng._node_frames[0]._columns['hh'].storage) == F.ctx() assert F.context(ng._node_frames[0]._columns['hh'].storage) == F.ctx()
assert F.context(ng._edge_frames[0]._columns['h'].storage) == F.cpu() assert F.context(ng._edge_frames[0]._columns['h'].storage) == F.cpu()
@parametrize_dtype
def test_create_block(idtype):
block = dgl.create_block(([0, 1, 2], [1, 2, 3]), idtype=idtype, device=F.ctx())
assert block.num_src_nodes() == 3
assert block.num_dst_nodes() == 4
assert block.num_edges() == 3
block = dgl.create_block(([], []), idtype=idtype, device=F.ctx())
assert block.num_src_nodes() == 0
assert block.num_dst_nodes() == 0
assert block.num_edges() == 0
block = dgl.create_block(([], []), 3, 4, idtype=idtype, device=F.ctx())
assert block.num_src_nodes() == 3
assert block.num_dst_nodes() == 4
assert block.num_edges() == 0
block = dgl.create_block(([0, 1, 2], [1, 2, 3]), 4, 5, idtype=idtype, device=F.ctx())
assert block.num_src_nodes() == 4
assert block.num_dst_nodes() == 5
assert block.num_edges() == 3
sx = F.randn((4, 5))
dx = F.randn((5, 6))
ex = F.randn((3, 4))
block.srcdata['x'] = sx
block.dstdata['x'] = dx
block.edata['x'] = ex
g = dgl.block_to_graph(block)
assert g.num_src_nodes() == 4
assert g.num_dst_nodes() == 5
assert g.num_edges() == 3
assert g.srcdata['x'] is sx
assert g.dstdata['x'] is dx
assert g.edata['x'] is ex
block = dgl.create_block({
('A', 'AB', 'B'): ([1, 2, 3], [2, 1, 0]),
('B', 'BA', 'A'): ([2, 3], [3, 4])},
idtype=idtype, device=F.ctx())
assert block.num_src_nodes('A') == 4
assert block.num_src_nodes('B') == 4
assert block.num_dst_nodes('B') == 3
assert block.num_dst_nodes('A') == 5
assert block.num_edges('AB') == 3
assert block.num_edges('BA') == 2
block = dgl.create_block({
('A', 'AB', 'B'): ([], []),
('B', 'BA', 'A'): ([], [])},
idtype=idtype, device=F.ctx())
assert block.num_src_nodes('A') == 0
assert block.num_src_nodes('B') == 0
assert block.num_dst_nodes('B') == 0
assert block.num_dst_nodes('A') == 0
assert block.num_edges('AB') == 0
assert block.num_edges('BA') == 0
block = dgl.create_block({
('A', 'AB', 'B'): ([], []),
('B', 'BA', 'A'): ([], [])},
num_src_nodes={'A': 5, 'B': 5},
num_dst_nodes={'A': 6, 'B': 4},
idtype=idtype, device=F.ctx())
assert block.num_src_nodes('A') == 5
assert block.num_src_nodes('B') == 5
assert block.num_dst_nodes('B') == 4
assert block.num_dst_nodes('A') == 6
assert block.num_edges('AB') == 0
assert block.num_edges('BA') == 0
block = dgl.create_block({
('A', 'AB', 'B'): ([1, 2, 3], [2, 1, 0]),
('B', 'BA', 'A'): ([2, 3], [3, 4])},
num_src_nodes={'A': 5, 'B': 5},
num_dst_nodes={'A': 6, 'B': 4},
idtype=idtype, device=F.ctx())
assert block.num_src_nodes('A') == 5
assert block.num_src_nodes('B') == 5
assert block.num_dst_nodes('B') == 4
assert block.num_dst_nodes('A') == 6
assert block.num_edges(('A', 'AB', 'B')) == 3
assert block.num_edges(('B', 'BA', 'A')) == 2
sax = F.randn((5, 3))
sbx = F.randn((5, 4))
dax = F.randn((6, 5))
dbx = F.randn((4, 6))
eabx = F.randn((3, 7))
ebax = F.randn((2, 8))
block.srcnodes['A'].data['x'] = sax
block.srcnodes['B'].data['x'] = sbx
block.dstnodes['A'].data['x'] = dax
block.dstnodes['B'].data['x'] = dbx
block.edges['AB'].data['x'] = eabx
block.edges['BA'].data['x'] = ebax
hg = dgl.block_to_graph(block)
assert hg.num_nodes('A_src') == 5
assert hg.num_nodes('B_src') == 5
assert hg.num_nodes('A_dst') == 6
assert hg.num_nodes('B_dst') == 4
assert hg.num_edges(('A_src', 'AB', 'B_dst')) == 3
assert hg.num_edges(('B_src', 'BA', 'A_dst')) == 2
assert hg.nodes['A_src'].data['x'] is sax
assert hg.nodes['B_src'].data['x'] is sbx
assert hg.nodes['A_dst'].data['x'] is dax
assert hg.nodes['B_dst'].data['x'] is dbx
assert hg.edges['AB'].data['x'] is eabx
assert hg.edges['BA'].data['x'] is ebax
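# Not part of the original test, an illustrative sketch: per the create_block
# docstring, giving a number of source nodes no greater than the max source ID
# in the data should raise DGLError (max source ID below is 2, so 2 is too few).
raised = False
try:
    dgl.create_block(([0, 1, 2], [1, 2, 3]), 2, 4, idtype=idtype, device=F.ctx())
except dgl.DGLError:
    raised = True
assert raised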
if __name__ == '__main__':
...@@ -2541,4 +2652,5 @@ if __name__ == '__main__':
#test_frame(F.int32)
#test_frame_device(F.int32)
#test_empty_query(F.int32)
#test_create_block(F.int32)
pass