[Feature] Make to_heterogeneous(to_homogeneous(hg)) return hg (#2958)

* make to_heterogeneous and to_homogeneous invertible
* docstring
* oops

Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>

[Feature] Make to_heterogeneous(to_homogeneous(hg)) return hg (#2958)
* make to_heterogeneous and to_homogeneous invertible
* docstring
* oops

Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
2df4a95f · Quan (Andy) Gan · GitHub · 60426278 · 2df4a95f · 2df4a95f
Unverified Commit 2df4a95f authored Jun 03, 2021 by Quan (Andy) Gan Committed by GitHub Jun 03, 2021
4 changed files
--- a/python/dgl/convert.py
+++ b/python/dgl/convert.py
@@ -631,8 +631,10 @@ def to_heterogeneous(G, ntypes, etypes, ntype_field=NTYPE,

    Notes
    -----
-    The returned node and edge types may not necessarily be in the same order as
-    ``ntypes`` and ``etypes``.
+    * The returned node and edge types may not necessarily be in the same order as
+      ``ntypes`` and ``etypes``.
+    * Calling :func:`~dgl.to_homogeneous` on a heterograph and then :func:`~dgl.to_heterogeneous`
+      on the result (with the same ``ntypes`` and ``etypes``) recovers the original heterograph.

    Examples
    --------
@@ -705,7 +707,7 @@ def to_heterogeneous(G, ntypes, etypes, ntype_field=NTYPE,
    # relabel nodes to per-type local IDs
    ntype_count = np.bincount(ntype_ids, minlength=num_ntypes)
    ntype_offset = np.insert(np.cumsum(ntype_count), 0, 0)
-    ntype_ids_sortidx = np.argsort(ntype_ids)
+    ntype_ids_sortidx = np.argsort(ntype_ids, kind='stable')
    ntype_local_ids = np.zeros_like(ntype_ids)
    node_groups = []
    for i in range(num_ntypes):
@@ -848,6 +850,8 @@ def to_homogeneous(G, ndata=None, edata=None, store_type=True, return_count=Fals
      to its memory efficiency.
    * The ``ntype_count`` and ``etype_count`` lists can help speed up some operations.
      See :class:`~dgl.nn.pytorch.conv.RelGraphConv` for such an example.
+    * Calling :func:`~dgl.to_homogeneous` on a heterograph and then :func:`~dgl.to_heterogeneous`
+      on the result (with the same ``ntypes`` and ``etypes``) recovers the original heterograph.

    Examples
    --------

--- a/tests/compute/test_heterograph.py
+++ b/tests/compute/test_heterograph.py
@@ -10,6 +10,7 @@ import unittest, pytest
 from dgl import DGLError
 import test_utils
 from test_utils import parametrize_dtype, get_cases
+from utils import assert_is_identical_hetero
 from scipy.sparse import rand

 def create_test_heterograph(idtype):
@@ -1111,6 +1112,14 @@ def test_to_homo2(idtype):
    for i, count in enumerate(etype_count):
        assert count == hg.num_edges(hg.canonical_etypes[i])

+@parametrize_dtype
+def test_invertible_conversion(idtype):
+    # Test whether to_homogeneous and to_heterogeneous are invertible
+    hg = create_test_heterograph(idtype)
+    g = dgl.to_homogeneous(hg)
+    hg2 = dgl.to_heterogeneous(g, hg.ntypes, hg.etypes)
+    assert_is_identical_hetero(hg, hg2, True)
+
 @parametrize_dtype
 def test_metagraph_reachable(idtype):
    g = create_test_heterograph(idtype)

--- a/tests/compute/test_pickle.py
+++ b/tests/compute/test_pickle.py
@@ -11,46 +11,7 @@ import io
 import unittest, pytest
 import test_utils
 from test_utils import parametrize_dtype, get_cases
-
-def _assert_is_identical(g, g2):
-    assert g.is_readonly == g2.is_readonly
-    assert g.number_of_nodes() == g2.number_of_nodes()
-    src, dst = g.all_edges(order='eid')
-    src2, dst2 = g2.all_edges(order='eid')
-    assert F.array_equal(src, src2)
-    assert F.array_equal(dst, dst2)
-
-    assert len(g.ndata) == len(g2.ndata)
-    assert len(g.edata) == len(g2.edata)
-    for k in g.ndata:
-        assert F.allclose(g.ndata[k], g2.ndata[k])
-    for k in g.edata:
-        assert F.allclose(g.edata[k], g2.edata[k])
-
-def _assert_is_identical_hetero(g, g2):
-    assert g.is_readonly == g2.is_readonly
-    assert g.ntypes == g2.ntypes
-    assert g.canonical_etypes == g2.canonical_etypes
-
-    # check if two metagraphs are identical
-    for edges, features in g.metagraph().edges(keys=True).items():
-        assert g2.metagraph().edges(keys=True)[edges] == features
-
-    # check if node ID spaces and feature spaces are equal
-    for ntype in g.ntypes:
-        assert g.number_of_nodes(ntype) == g2.number_of_nodes(ntype)
-        assert len(g.nodes[ntype].data) == len(g2.nodes[ntype].data)
-        for k in g.nodes[ntype].data:
-            assert F.allclose(g.nodes[ntype].data[k], g2.nodes[ntype].data[k])
-
-    # check if edge ID spaces and feature spaces are equal
-    for etype in g.canonical_etypes:
-        src, dst = g.all_edges(etype=etype, order='eid')
-        src2, dst2 = g2.all_edges(etype=etype, order='eid')
-        assert F.array_equal(src, src2)
-        assert F.array_equal(dst, dst2)
-        for k in g.edges[etype].data:
-            assert F.allclose(g.edges[etype].data[k], g2.edges[etype].data[k])
+from utils import assert_is_identical, assert_is_identical_hetero

 def _assert_is_identical_nodeflow(nf1, nf2):
    assert nf1.is_readonly == nf2.is_readonly
@@ -74,13 +35,13 @@ def _assert_is_identical_nodeflow(nf1, nf2):
            assert F.allclose(nf1.blocks[i].data[k], nf2.blocks[i].data[k])

 def _assert_is_identical_batchedgraph(bg1, bg2):
-    _assert_is_identical(bg1, bg2)
+    assert_is_identical(bg1, bg2)
    assert bg1.batch_size == bg2.batch_size
    assert bg1.batch_num_nodes == bg2.batch_num_nodes
    assert bg1.batch_num_edges == bg2.batch_num_edges

 def _assert_is_identical_batchedhetero(bg1, bg2):
-    _assert_is_identical_hetero(bg1, bg2)
+    assert_is_identical_hetero(bg1, bg2)
    for ntype in bg1.ntypes:
        assert bg1.batch_num_nodes(ntype) == bg2.batch_num_nodes(ntype)
    for canonical_etype in bg1.canonical_etypes:

--- a/tests/compute/utils.py
+++ b/tests/compute/utils.py
 import pytest
 import backend as F
+import dgl
+from dgl.base import is_internal_column

 if F._default_context_str == 'cpu':
    parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
@@ -13,3 +15,58 @@ def check_fail(fn, *args, **kwargs):
        return False
    except:
        return True
+
+def assert_is_identical(g, g2):
+    assert g.is_readonly == g2.is_readonly
+    assert g.number_of_nodes() == g2.number_of_nodes()
+    src, dst = g.all_edges(order='eid')
+    src2, dst2 = g2.all_edges(order='eid')
+    assert F.array_equal(src, src2)
+    assert F.array_equal(dst, dst2)
+
+    assert len(g.ndata) == len(g2.ndata)
+    assert len(g.edata) == len(g2.edata)
+    for k in g.ndata:
+        assert F.allclose(g.ndata[k], g2.ndata[k])
+    for k in g.edata:
+        assert F.allclose(g.edata[k], g2.edata[k])
+
+def assert_is_identical_hetero(g, g2, ignore_internal_data=False):
+    assert g.is_readonly == g2.is_readonly
+    assert g.ntypes == g2.ntypes
+    assert g.canonical_etypes == g2.canonical_etypes
+
+    # check if two metagraphs are identical
+    for edges, features in g.metagraph().edges(keys=True).items():
+        assert g2.metagraph().edges(keys=True)[edges] == features
+
+    # check if node ID spaces and feature spaces are equal
+    for ntype in g.ntypes:
+        assert g.number_of_nodes(ntype) == g2.number_of_nodes(ntype)
+        if ignore_internal_data:
+            for k in list(g.nodes[ntype].data.keys()):
+                if is_internal_column(k):
+                    del g.nodes[ntype].data[k]
+            for k in list(g2.nodes[ntype].data.keys()):
+                if is_internal_column(k):
+                    del g2.nodes[ntype].data[k]
+        assert len(g.nodes[ntype].data) == len(g2.nodes[ntype].data)
+        for k in g.nodes[ntype].data:
+            assert F.allclose(g.nodes[ntype].data[k], g2.nodes[ntype].data[k])
+
+    # check if edge ID spaces and feature spaces are equal
+    for etype in g.canonical_etypes:
+        src, dst = g.all_edges(etype=etype, order='eid')
+        src2, dst2 = g2.all_edges(etype=etype, order='eid')
+        assert F.array_equal(src, src2)
+        assert F.array_equal(dst, dst2)
+        if ignore_internal_data:
+            for k in list(g.edges[etype].data.keys()):
+                if is_internal_column(k):
+                    del g.edges[etype].data[k]
+            for k in list(g2.edges[etype].data.keys()):
+                if is_internal_column(k):
+                    del g2.edges[etype].data[k]
+        assert len(g.edges[etype].data) == len(g2.edges[etype].data)
+        for k in g.edges[etype].data:
+            assert F.allclose(g.edges[etype].data[k], g2.edges[etype].data[k])