[Transform] New ToSimple API (#1789)

* To simple * lint * upd * more test * fix * Fix * Fix * add share edata * doc * Fix * Fix * Fix * Fix * lint * lint * Fix * Fix Co-authored-by: Ubuntu <ubuntu@ip-172-31-51-214.ec2.internal> Co-authored-by: Mufei Li <mufeili1996@gmail.com>

[Transform] New ToSimple API (#1789)
* To simple * lint * upd * more test * fix * Fix * Fix * add share edata * doc * Fix * Fix * Fix * Fix * lint * lint * Fix * Fix Co-authored-by: Ubuntu <ubuntu@ip-172-31-51-214.ec2.internal> Co-authored-by: Mufei Li <mufeili1996@gmail.com>
a5722d02 · xiang song(charlie.song) · GitHub · a1472bcf · a5722d02 · a5722d02
Unverified Commit a5722d02 authored Jul 15, 2020 by xiang song(charlie.song) Committed by GitHub Jul 15, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 226 additions and 41 deletions

python/dgl/transform.py python/dgl/transform.py +163 -36

tests/compute/test_transform.py tests/compute/test_transform.py +63 -5

No files found.
--- a/python/dgl/transform.py
+++ b/python/dgl/transform.py
@@ -1393,56 +1393,159 @@ def out_subgraph(g, nodes):
        ret.edges[etype].data[EID] = induced_edges[i].tousertensor()
    return ret
-def to_simple(g, return_counts='count', writeback_mapping=None):
+def to_simple(g, return_counts='count', writeback_mapping=False, copy_ndata=True, copy_edata=False):
-    """Convert a heterogeneous multigraph to a heterogeneous simple graph, coalescing
+    r"""Convert a graph to a simple graph without duplicate edges.
-    duplicate edges into one.
-    This function does not preserve node and edge features.
+    For a heterograph with multiple edge types, we
+    treat edges corresponding
+    to each type as a separate graph and convert each
+    of them to a simple graph.
+    When writeback_mapping=True, an extra mapping is returned.
+    For the edges in the original graph,
+    a writeback mapping is a tensor recording their new
+    ids in the simple graph. If the graph has
+    only one edge type, a single tensor is returned.
+    If the graph has multiple edge types, a dictionary
+    of tensor is returned using canonical edge types
+    as the key.
+    Given a :class:`dgl.DGLGraph` object, we return
+    another :class:`dgl.DGLGraph` object representing the
+    simple graph corresponding to it.
-    TODO(xiangsx): Don't save writeback_mapping into g, but put it into return value.
    Parameters
    ----------
-    g : DGLHeteroGraph
+    g : DGLGraph
-        The heterogeneous graph
+        The input graph.
    return_counts : str, optional
-        If given, the returned graph would have a column with the same name that stores
+        If given, the count of each edge in the original graph
-        the number of duplicated edges from the original graph.
+        will be stored as edge features under the name
-    writeback_mapping : str, optional
+        ``return_counts``.
-        If given, the mapping from the edge IDs of original graph to those of the returned
+        (Default: "count")
-        graph would be written into edge feature with this name in the original graph for
+    writeback_mapping: bool, optional
-        each edge type.
+        If True, a write back mapping is returned for each edge
+        type subgraph. If False, only the simple graph is returned.
+        (Default: False)
+    copy_ndata: bool, optional
+        If True, the node features of the simple graph are copied
+        from the original graph. If False, the simple
+        graph will not have any node features.
+        (Default: True)
+    copy_edata: bool, optional
+        If True, the edge features of the simple graph are copied
+        from the original graph. If there exist duplicate edges between
+        two nodes (u, v), the feature of the edge is taken from one of the
+        duplicate edges (the current implementation picks the first one by edge ID).
+        If False, the simple graph will not have any edge features.
+        (Default: False)
    Returns
    -------
-    DGLHeteroGraph
+    DGLGraph
-        The new heterogeneous simple graph.
+        A simple graph.
+    tensor or dict of tensor
+        If writeback_mapping is True, the writeback
+        mapping is returned. If the graph has only
+        one edge type, a tensor is returned. If the
+        graph has multiple edge types, a dictionary
+        of tensors is returned.
+    If ``copy_ndata`` is ``True``, the same tensors are used for the node
+    features of the original graph and of the simple graph. As a result, users
+    should avoid performing in-place operations on the features of the simple
+    graph, since doing so will corrupt the features of the original graph as well. For
+    concrete examples, refer to the ``Examples`` section below.
    Examples
    --------
-    Consider the following graph
+    **Homographs or Heterographs with A Single Edge Type**
-    >>> g = dgl.graph(([0, 1, 2, 1, 1, 1], [1, 3, 2, 3, 4, 4]))
-    >>> sg = dgl.to_simple(g, return_counts='weights', writeback_mapping='new_eid')
+    Create a graph for demonstrating to_simple API.
+    In the original graph, there are multiple edges between 1 and 2.
+    >>> import dgl
+    >>> import torch as th
+    >>> g = dgl.graph((th.tensor([0, 1, 2, 1]), th.tensor([1, 2, 0, 2])))
+    >>> g.ndata['h'] = th.tensor([[0.], [1.], [2.]])
+    >>> g.edata['h'] = th.tensor([[3.], [4.], [5.], [6.]])
+    Convert the graph to a simple graph. The edge counts are
+    stored in the edge feature 'cnt' and the writeback mapping
+    is returned as a tensor.
+    >>> sg, wm = dgl.to_simple(g, return_counts='cnt', writeback_mapping=True)
+    >>> sg.ndata['h']
+    tensor([[0.],
+            [1.],
+            [2.]])
+    >>> u, v, eid = sg.edges(form='all')
+    >>> u
+    tensor([0, 1, 2])
+    >>> v
+    tensor([1, 2, 0])
+    >>> eid
+    tensor([0, 1, 2])
+    >>> sg.edata['cnt']
+    tensor([1, 2, 1])
+    >>> wm
+    tensor([0, 1, 2, 1])
+    >>> 'h' in sg.edata
+    False
+    **In-place operations on the features of one graph will be reflected on the features of
+    the other graph, which is dangerous. Out-of-place operations will not be reflected.**
-    The returned graph would have duplicate edges connecting (1, 3) and (1, 4) removed:
+    >>> sg.ndata['h'] += 1
-    >>> sg.all_edges(form='uv', order='eid')
+    >>> g.ndata['h']
-    (tensor([0, 1, 1, 2]), tensor([1, 3, 4, 2]))
+    tensor([[1.],
+            [2.],
+            [3.]])
+    >>> g.ndata['h'] += 1
+    >>> sg.ndata['h']
+    tensor([[2.],
+            [3.],
+            [4.]])
+    >>> sg.ndata['h2'] = th.ones(3, 1)
+    >>> 'h2' in g.ndata
+    False
-    If ``return_counts`` is set, the returned graph will also return how many edges
+    **Heterographs with Multiple Edge Types**
-    in the original graph are connecting the endpoints of the edges in the new graph:
-    >>> sg.edata['weights']
-    tensor([1, 2, 2, 1])
-    This essentially reads that one edge is connecting (0, 1) in ``g``, whereas 2 edges
+    >>> g = dgl.heterograph({
-    are connecting (1, 3) in ``g``, etc.
+    >>>     ('user', 'wins', 'user'): (th.tensor([0, 2, 0, 2, 2]), th.tensor([1, 1, 2, 1, 0])),
+    >>>     ('user', 'plays', 'game'): (th.tensor([1, 2, 1]), th.tensor([2, 1, 1]))
+    >>> })
+    >>> g.nodes['game'].data['hv'] = th.ones(3, 1)
+    >>> g.edges['plays'].data['he'] = th.zeros(3, 1)
-    One can also retrieve the mapping from the edges in the original graph to edges in
+    The to_simple operation is applied to the subgraph
-    the new graph by setting ``writeback_mapping`` and running
+    corresponding to ('user', 'wins', 'user') and the
-    >>> g.edata['new_eid']
+    subgraph corresponding to ('user', 'plays', 'game').
-    tensor([0, 1, 3, 1, 2, 2])
+    The edge counts are stored in the default edge feature
+    'count'.
-    This tells us that the first edge in ``g`` is mapped to the first edge in ``sg``, and
+    >>> sg, wm = dgl.to_simple(g, copy_ndata=False, writeback_mapping=True)
-    the second and the fourth edge are mapped to the second edge in ``sg``, etc.
+    >>> sg
+    Graph(num_nodes={'game': 3, 'user': 3},
+          num_edges={('user', 'wins', 'user'): 4, ('user', 'plays', 'game'): 3},
+          metagraph=[('user', 'user'), ('user', 'game')])
+    >>> sg.edges(etype='wins')
+    (tensor([0, 2, 0, 2]), tensor([1, 1, 2, 0]))
+    >>> wm[('user', 'wins', 'user')]
+    tensor([0, 1, 2, 1, 3])
+    >>> sg.edges(etype='plays')
+    (tensor([2, 1, 1]), tensor([1, 2, 1]))
+    >>> wm[('user', 'plays', 'game')]
+    tensor([0, 1, 2])
+    >>> 'hv' in sg.nodes['game'].data
+    False
+    >>> 'he' in sg.edges['plays'].data
+    False
+    >>> sg.edata['count']
+    {('user', 'wins', 'user'): tensor([1, 2, 1, 1]),
+     ('user', 'plays', 'game'): tensor([1, 1, 1])}
    """
    simple_graph_index, counts, edge_maps = _CAPI_DGLToSimpleHetero(g._graph)
    simple_graph = DGLHeteroGraph(simple_graph_index, g.ntypes, g.etypes)
@@ -1453,12 +1556,36 @@ def to_simple(g, return_counts='count', writeback_mapping=None):
        for count, canonical_etype in zip(counts, g.canonical_etypes):
            simple_graph.edges[canonical_etype].data[return_counts] = count
-    if writeback_mapping is not None:
+    if copy_ndata:
-        for edge_map, canonical_etype in zip(edge_maps, g.canonical_etypes):
+        for ntype in g.ntypes:
-            g.edges[canonical_etype].data[writeback_mapping] = edge_map
+            for key in g.nodes[ntype].data:
+                simple_graph.nodes[ntype].data[key] = g.nodes[ntype].data[key]
+    if copy_edata:
+        for i, c_etype in enumerate(g.canonical_etypes):
+            for key in g.edges[c_etype].data:
+                feat_idx = F.asnumpy(edge_maps[i])
+                _, indices = np.unique(feat_idx, return_index=True)
+                simple_graph.edges[c_etype].data[key] = \
+                    F.gather_row(g.edges[c_etype].data[key],
+                                 F.copy_to(F.tensor(indices),
+                                           F.context(g.edges[c_etype].data[key])))
+    if writeback_mapping:
+        # single edge type
+        if len(edge_maps) == 1:
+            return simple_graph, edge_maps[0]
+        # multiple edge type
+        else:
+            wb_map = {}
+            for edge_map, canonical_etype in zip(edge_maps, g.canonical_etypes):
+                wb_map[canonical_etype] = edge_map
+            return simple_graph, wb_map
    return simple_graph
+DGLHeteroGraph.to_simple = to_simple
 def as_heterograph(g, ntype='_U', etype='_E'):
    """Convert a DGLGraph to a DGLHeteroGraph with one node and edge type.

--- a/tests/compute/test_transform.py
+++ b/tests/compute/test_transform.py
@@ -680,19 +680,59 @@ def test_compact(index_dtype):
    _check(g3, new_g3, induced_nodes)
    _check(g4, new_g4, induced_nodes)
+@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU to simple not implemented")
 @parametrize_dtype
 def test_to_simple(index_dtype):
+    # homogeneous graph
+    g = dgl.graph((F.tensor([0, 1, 2, 1]), F.tensor([1, 2, 0, 2])))
+    g.ndata['h'] = F.tensor([[0.], [1.], [2.]])
+    g.edata['h'] = F.tensor([[3.], [4.], [5.], [6.]])
+    sg, wb = dgl.to_simple(g, writeback_mapping=True)
+    u, v = g.all_edges(form='uv', order='eid')
+    u = F.asnumpy(u).tolist()
+    v = F.asnumpy(v).tolist()
+    uv = list(zip(u, v))
+    eid_map = F.asnumpy(wb)
+    su, sv = sg.all_edges(form='uv', order='eid')
+    su = F.asnumpy(su).tolist()
+    sv = F.asnumpy(sv).tolist()
+    suv = list(zip(su, sv))
+    sc = F.asnumpy(sg.edata['count'])
+    assert set(uv) == set(suv)
+    for i, e in enumerate(suv):
+        assert sc[i] == sum(e == _e for _e in uv)
+    for i, e in enumerate(uv):
+        assert eid_map[i] == suv.index(e)
+    # shared ndata
+    assert F.array_equal(sg.ndata['h'], g.ndata['h'])
+    assert 'h' not in sg.edata
+    # new ndata to sg
+    sg.ndata['hh'] = F.tensor([[0.], [1.], [2.]])
+    assert 'hh' not in g.ndata
+    sg = dgl.to_simple(g, writeback_mapping=False, copy_ndata=False)
+    assert 'h' not in sg.ndata
+    assert 'h' not in sg.edata
+    # heterogeneous graph
    g = dgl.heterograph({
-        ('user', 'follow', 'user'): [(0, 1), (1, 3), (2, 2), (1, 3), (1, 4), (1, 4)],
+        ('user', 'follow', 'user'): ([0, 1, 2, 1, 1, 1],
-        ('user', 'plays', 'game'): [(3, 5), (2, 3), (1, 4), (1, 4), (3, 5), (2, 3), (2, 3)]}, index_dtype=index_dtype)
+                                     [1, 3, 2, 3, 4, 4]),
-    sg = dgl.to_simple(g, return_counts='weights', writeback_mapping='new_eid')
+        ('user', 'plays', 'game'): ([3, 2, 1, 1, 3, 2, 2], [5, 3, 4, 4, 5, 3, 3])},
+        index_dtype=index_dtype)
+    g.nodes['user'].data['h'] = F.tensor([0, 1, 2, 3, 4])
+    g.nodes['user'].data['hh'] = F.tensor([0, 1, 2, 3, 4])
+    g.edges['follow'].data['h'] = F.tensor([0, 1, 2, 3, 4, 5])
+    sg, wb = dgl.to_simple(g, return_counts='weights', writeback_mapping=True, copy_edata=True)
+    g.nodes['game'].data['h'] = F.tensor([0, 1, 2, 3, 4, 5])
    for etype in g.canonical_etypes:
        u, v = g.all_edges(form='uv', order='eid', etype=etype)
        u = F.asnumpy(u).tolist()
        v = F.asnumpy(v).tolist()
        uv = list(zip(u, v))
-        eid_map = F.asnumpy(g.edges[etype].data['new_eid'])
+        eid_map = F.asnumpy(wb[etype])
        su, sv = sg.all_edges(form='uv', order='eid', etype=etype)
        su = F.asnumpy(su).tolist()
@@ -705,6 +745,24 @@ def test_to_simple(index_dtype):
            assert sw[i] == sum(e == _e for _e in uv)
        for i, e in enumerate(uv):
            assert eid_map[i] == suv.index(e)
+    # shared ndata
+    assert F.array_equal(sg.nodes['user'].data['h'], g.nodes['user'].data['h'])
+    assert F.array_equal(sg.nodes['user'].data['hh'], g.nodes['user'].data['hh'])
+    assert 'h' not in sg.nodes['game'].data
+    # new ndata to sg
+    sg.nodes['user'].data['hhh'] = F.tensor([0, 1, 2, 3, 4])
+    assert 'hhh' not in g.nodes['user'].data
+    # share edata
+    feat_idx = F.asnumpy(wb[('user', 'follow', 'user')])
+    _, indices = np.unique(feat_idx, return_index=True)
+    assert np.array_equal(F.asnumpy(sg.edges['follow'].data['h']),
+                          F.asnumpy(g.edges['follow'].data['h'])[indices])
+    sg = dgl.to_simple(g, writeback_mapping=False, copy_ndata=False)
+    for ntype in g.ntypes:
+        assert g.number_of_nodes(ntype) == sg.number_of_nodes(ntype)
+    assert 'h' not in sg.nodes['user'].data
+    assert 'hh' not in sg.nodes['user'].data
 @unittest.skipIf(F._default_context_str == 'gpu', reason="GPU compaction not implemented")
 @parametrize_dtype
@@ -877,7 +935,7 @@ if __name__ == '__main__':
    # test_add_self_loop()
    # test_partition_with_halo()
    # test_metis_partition()
-    test_hetero_linegraph('int32')
+    # test_hetero_linegraph('int32')
    # test_compact()
    test_to_simple("int32")
    # test_in_subgraph("int32")