[Feature] PinSAGE sampler (#1249)

* [WIP] PinSAGE operators * moved the edge remapping mess into C * some docstrings * lint * lint x2 * lint x3 * skip gpu test on topk * extend pinsage to any metapath * lint x4 * addresses #1265 * add always_preserve (fixes #1266) and fix a silly bug * disable gpu test on compaction * lint * fix a horrible bug and add more tests * lint * addresses comments * lint * bugfix * addresses comments Co-authored-by: Minjie Wang <minjie.wang@nyu.edu>

[Feature] PinSAGE sampler (#1249)
* [WIP] PinSAGE operators * moved the edge remapping mess into C * some docstrings * lint * lint x2 * lint x3 * skip gpu test on topk * extend pinsage to any metapath * lint x4 * addresses #1265 * add always_preserve (fixes #1266) and fix a silly bug * disable gpu test on compaction * lint * fix a horrible bug and add more tests * lint * addresses comments * lint * bugfix * addresses comments Co-authored-by: Minjie Wang <minjie.wang@nyu.edu>
c3a33407 · Quan (Andy) Gan · GitHub · 87bca129 · c3a33407 · c3a33407
Unverified Commit c3a33407 authored Feb 28, 2020 by Quan (Andy) Gan Committed by GitHub Feb 28, 2020
4 changed files
--- a/src/graph/transform/to_simple.cc
+++ b/src/graph/transform/to_simple.cc
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file graph/transform/to_simple.cc
+ * \brief Convert multigraphs to simple graphs
+ */
+
+#include <dgl/base_heterograph.h>
+#include <dgl/transform.h>
+#include <dgl/array.h>
+#include <dgl/packed_func_ext.h>
+#include <tuple>
+#include <utility>
+#include <vector>
+#include "../unit_graph.h"
+#include "../../c_api_common.h"
+
+namespace dgl {
+
+using namespace dgl::runtime;
+using namespace dgl::aten;
+
+namespace transform {
+
+/*!
+ * \brief Convert a (possibly multi-) heterograph into a simple graph by coalescing
+ *        duplicate edges within every edge type.
+ *
+ * Each edge type is handled independently: its COO adjacency is sorted, coalesced
+ * with COOCoalesce, and a new relation graph is built from the coalesced COO.  The
+ * metagraph of the input is reused for the result.
+ *
+ * \param graph The input heterograph.
+ * \return A tuple of
+ *   - the heterograph built from the coalesced relation graphs,
+ *   - one count array per edge type (entry i is the number of original edges that
+ *     were coalesced into new edge #i),
+ *   - one mapping array per edge type (entry i is the new edge ID that original
+ *     edge #i was mapped to).
+ */
+std::tuple<HeteroGraphPtr, std::vector<IdArray>, std::vector<IdArray>>
+ToSimpleGraph(const HeteroGraphPtr graph) {
+  const int64_t num_etypes = graph->NumEdgeTypes();
+  const auto metagraph = graph->meta_graph();
+
+  std::vector<IdArray> counts(num_etypes), edge_maps(num_etypes);
+  std::vector<HeteroGraphPtr> rel_graphs(num_etypes);
+
+  for (int64_t etype = 0; etype < num_etypes; ++etype) {
+    const auto vtypes = graph->GetEndpointTypes(etype);
+    const COOMatrix adj = graph->GetCOOMatrix(etype);
+    const COOMatrix sorted_adj = COOSort(adj, true);
+    // After sorting, the data array holds the original edge IDs in sorted order.
+    const IdArray eids_shuffled = sorted_adj.data;
+    const auto &coalesced_result = COOCoalesce(sorted_adj);
+    const COOMatrix &coalesced_adj = coalesced_result.first;
+    const IdArray &count = coalesced_result.second;
+
+    /*
+     * eids_shuffled actually already contains the mapping from old edge space to the
+     * new one:
+     *
+     * * eids_shuffled[0:count[0]] indicates the original edge IDs that coalesced into new
+     *   edge #0.
+     * * eids_shuffled[count[0]:count[0] + count[1]] indicates those that coalesced into
+     *   new edge #1.
+     * * eids_shuffled[count[0] + count[1]:count[0] + count[1] + count[2]] indicates those
+     *   that coalesced into new edge #2.
+     * * etc.
+     *
+     * Here, we need to translate eids_shuffled to an array "eids_remapped" such that
+     * eids_remapped[i] indicates the new edge ID the old edge #i is mapped to.  The
+     * translation can simply be achieved by (in numpy code):
+     *
+     *     new_eid_for_eids_shuffled = np.arange(len(count)).repeat(count)
+     *     eids_remapped = np.zeros_like(new_eid_for_eids_shuffled)
+     *     eids_remapped[eids_shuffled] = new_eid_for_eids_shuffled
+     */
+    const IdArray new_eids = Range(
+        0, coalesced_adj.row->shape[0], coalesced_adj.row->dtype.bits, coalesced_adj.row->ctx);
+    const IdArray eids_remapped = Scatter(Repeat(new_eids, count), eids_shuffled);
+
+    edge_maps[etype] = eids_remapped;
+    counts[etype] = count;
+    // Source and destination of the same node type form a 1-vertex-type relation
+    // graph; otherwise the relation graph has 2 vertex types.
+    rel_graphs[etype] = UnitGraph::CreateFromCOO(
+        vtypes.first == vtypes.second ? 1 : 2,
+        coalesced_adj.num_rows,
+        coalesced_adj.num_cols,
+        coalesced_adj.row,
+        coalesced_adj.col);
+  }
+
+  const HeteroGraphPtr result = CreateHeteroGraph(metagraph, rel_graphs);
+
+  // counts and edge_maps are not used after this point, so hand them to the tuple
+  // by move instead of copying both vectors.
+  return std::make_tuple(result, std::move(counts), std::move(edge_maps));
+}
+
+/*!
+ * \brief Registers ToSimpleGraph under the global name
+ *        "transform._CAPI_DGLToSimpleHetero".
+ *
+ * The packed function takes one argument (a heterograph reference) and returns a
+ * list of three objects: the resulting graph, the list of per-edge-type count
+ * arrays, and the list of per-edge-type edge mapping arrays.
+ */
+DGL_REGISTER_GLOBAL("transform._CAPI_DGLToSimpleHetero")
+.set_body([] (DGLArgs args, DGLRetValue *rv) {
+    const HeteroGraphRef hg = args[0];
+    const auto simplified = ToSimpleGraph(hg.sptr());
+
+    // Name the tuple fields once so the packing code below is self-describing.
+    const HeteroGraphPtr &simple_graph = std::get<0>(simplified);
+    const std::vector<IdArray> &count_arrays = std::get<1>(simplified);
+    const std::vector<IdArray> &mapping_arrays = std::get<2>(simplified);
+
+    // Each array is wrapped with MakeValue into a Value before it goes into a List.
+    List<Value> counts;
+    for (const IdArray &arr : count_arrays)
+      counts.push_back(Value(MakeValue(arr)));
+
+    List<Value> edge_maps;
+    for (const IdArray &arr : mapping_arrays)
+      edge_maps.push_back(Value(MakeValue(arr)));
+
+    List<ObjectRef> packed;
+    packed.push_back(HeteroGraphRef(simple_graph));
+    packed.push_back(counts);
+    packed.push_back(edge_maps);
+
+    *rv = packed;
+  });
+
+};  // namespace transform
+
+};  // namespace dgl
--- a/tests/compute/test_heterograph.py
+++ b/tests/compute/test_heterograph.py
@@ -1365,51 +1365,6 @@ def test_empty_heterograph():
    assert g.number_of_nodes('developer') == 2


-def test_compact():
-    g1 = dgl.heterograph({
-        ('user', 'follow', 'user'): [(1, 3), (3, 5)],
-        ('user', 'plays', 'game'): [(2, 4), (3, 4), (2, 5)],
-        ('game', 'wished-by', 'user'): [(6, 7), (5, 7)]},
-        {'user': 20, 'game': 10})
-
-    g2 = dgl.heterograph({
-        ('game', 'clicked-by', 'user'): [(3, 1)],
-        ('user', 'likes', 'user'): [(1, 8), (8, 9)]},
-        {'user': 20, 'game': 10})
-
-    def _check(g, new_g, induced_nodes):
-        assert g.ntypes == new_g.ntypes
-        assert g.canonical_etypes == new_g.canonical_etypes
-
-        for ntype in g.ntypes:
-            assert -1 not in induced_nodes[ntype]
-
-        for etype in g.canonical_etypes:
-            g_src, g_dst = g.all_edges(order='eid', etype=etype)
-            g_src = F.asnumpy(g_src)
-            g_dst = F.asnumpy(g_dst)
-            new_g_src, new_g_dst = new_g.all_edges(order='eid', etype=etype)
-            new_g_src_mapped = induced_nodes[etype[0]][F.asnumpy(new_g_src)]
-            new_g_dst_mapped = induced_nodes[etype[2]][F.asnumpy(new_g_dst)]
-            assert (g_src == new_g_src_mapped).all()
-            assert (g_dst == new_g_dst_mapped).all()
-
-    new_g1 = dgl.compact_graphs(g1)
-    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes}
-    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
-    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7])
-    assert set(induced_nodes['game']) == set([4, 5, 6])
-    _check(g1, new_g1, induced_nodes)
-
-    new_g1, new_g2 = dgl.compact_graphs([g1, g2])
-    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes}
-    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
-    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7, 8, 9])
-    assert set(induced_nodes['game']) == set([3, 4, 5, 6])
-    _check(g1, new_g1, induced_nodes)
-    _check(g2, new_g2, induced_nodes)
-
-
 def test_types_in_function():
    def mfunc1(edges):
        assert edges.canonical_etype == ('user', 'follow', 'user')
@@ -1513,6 +1468,5 @@ if __name__ == '__main__':
    test_updates()
    test_backward()
    test_empty_heterograph()
-    test_compact()
    test_types_in_function()
    test_stack_reduce()
--- a/tests/compute/test_sampling.py
+++ b/tests/compute/test_sampling.py
@@ -37,6 +37,7 @@ def test_random_walk():
        ('item', 'viewed-by', 'user'): [(0, 0), (1, 0), (1, 1), (2, 2), (2, 3), (1, 3)]})

    g2.edata['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32)
+    g2.edata['p2'] = F.tensor([[3], [0], [3], [3], [3]], dtype=F.float32)
    g4.edges['follow'].data['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32)
    g4.edges['viewed-by'].data['p'] = F.tensor([1, 1, 1, 1, 1, 1], dtype=F.float32)

@@ -62,6 +63,14 @@ def test_random_walk():
        g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4, prob='p')
    check_random_walk(g2, ['follow'] * 4, traces, ntypes, 'p')

+    try:
+        traces, ntypes = dgl.sampling.random_walk(
+            g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4, prob='p2')
+        fail = False
+    except dgl.DGLError:
+        fail = True
+    assert fail
+
    metapath = ['follow', 'view', 'viewed-by'] * 2
    traces, ntypes = dgl.sampling.random_walk(
        g3, [0, 1, 2, 0, 1, 2], metapath=metapath)
@@ -103,6 +112,35 @@ def test_pack_traces():
    assert F.array_equal(result[2], F.tensor([2, 7], dtype=F.int64))
    assert F.array_equal(result[3], F.tensor([0, 2], dtype=F.int64))

+def test_pinsage_sampling():  # exercises PinSAGESampler and RandomWalkNeighborSampler on small graphs
+    def _test_sampler(g, sampler, ntype):  # note: `g` is accepted but not read in this helper
+        neighbor_g = sampler(F.tensor([0, 2], dtype=F.int64))  # sample for seed nodes 0 and 2
+        assert neighbor_g.ntypes == [ntype]  # the returned graph has exactly one node type
+        u, v = neighbor_g.all_edges(form='uv', order='eid')
+        uv = list(zip(F.asnumpy(u).tolist(), F.asnumpy(v).tolist()))
+        assert (1, 0) in uv or (0, 0) in uv  # expects an edge 1->0 or 0->0
+        assert (2, 2) in uv or (3, 2) in uv  # expects an edge 2->2 or 3->2
+
+    g = dgl.heterograph({  # items {0, 1} link only to users {0, 1}; items {2, 3} only to users {2, 3}
+        ('item', 'bought-by', 'user'): [(0, 0), (0, 1), (1, 0), (1, 1), (2, 2), (2, 3), (3, 2), (3, 3)],
+        ('user', 'bought', 'item'): [(0, 0), (1, 0), (0, 1), (1, 1), (2, 2), (3, 2), (2, 3), (3, 3)]})
+    sampler = dgl.sampling.PinSAGESampler(g, 'item', 'user', 4, 0.5, 3, 2)
+    _test_sampler(g, sampler, 'item')
+    sampler = dgl.sampling.RandomWalkNeighborSampler(g, 4, 0.5, 3, 2, ['bought-by', 'bought'])  # metapath as edge type names
+    _test_sampler(g, sampler, 'item')
+    sampler = dgl.sampling.RandomWalkNeighborSampler(g, 4, 0.5, 3, 2,  # same metapath as canonical edge type triples
+        [('item', 'bought-by', 'user'), ('user', 'bought', 'item')])
+    _test_sampler(g, sampler, 'item')
+    g = dgl.graph([(0, 0), (0, 1), (1, 0), (1, 1), (2, 2), (2, 3), (3, 2), (3, 3)])  # built with dgl.graph; no metapath is passed below
+    sampler = dgl.sampling.RandomWalkNeighborSampler(g, 4, 0.5, 3, 2)
+    _test_sampler(g, sampler, g.ntypes[0])
+    g = dgl.heterograph({  # three edge types chained A -> B -> C -> A
+        ('A', 'AB', 'B'): [(0, 1), (2, 3)],
+        ('B', 'BC', 'C'): [(1, 2), (3, 1)],
+        ('C', 'CA', 'A'): [(2, 0), (1, 2)]})
+    sampler = dgl.sampling.RandomWalkNeighborSampler(g, 4, 0.5, 3, 2, ['AB', 'BC', 'CA'])
+    _test_sampler(g, sampler, 'A')
+
 def _gen_neighbor_sampling_test_graph(hypersparse, reverse):
    if hypersparse:
        # should crash if allocated a CSR
@@ -305,7 +343,7 @@ def _test_sample_neighbors_topk(hypersparse):
    g, hg = _gen_neighbor_topk_test_graph(hypersparse, False)

    def _test1():
-        subg = dgl.sampling.sample_neighbors_topk(g, [0, 1], 2, 'weight')
+        subg = dgl.sampling.select_topk(g, 2, 'weight', [0, 1])
        assert subg.number_of_nodes() == g.number_of_nodes()
        assert subg.number_of_edges() == 4
        u, v = subg.edges()
@@ -315,7 +353,7 @@ def _test_sample_neighbors_topk(hypersparse):
    _test1()

    def _test2():  # k > #neighbors
-        subg = dgl.sampling.sample_neighbors_topk(g, [0, 2], 2, 'weight')
+        subg = dgl.sampling.select_topk(g, 2, 'weight', [0, 2])
        assert subg.number_of_nodes() == g.number_of_nodes()
        assert subg.number_of_edges() == 3
        u, v = subg.edges()
@@ -325,7 +363,7 @@ def _test_sample_neighbors_topk(hypersparse):
    _test2()

    def _test3():
-        subg = dgl.sampling.sample_neighbors_topk(hg, {'user' : [0,1], 'game' : 0}, 2, 'weight')
+        subg = dgl.sampling.select_topk(hg, 2, 'weight', {'user' : [0,1], 'game' : 0})
        assert len(subg.ntypes) == 3
        assert len(subg.etypes) == 4
        u, v = subg['follow'].edges()
@@ -344,7 +382,7 @@ def _test_sample_neighbors_topk(hypersparse):
    _test3()

    # test different k for different relations
-    subg = dgl.sampling.sample_neighbors_topk(hg, {'user' : [0,1], 'game' : 0}, [1, 2, 0, 2], 'weight')
+    subg = dgl.sampling.select_topk(hg, [1, 2, 0, 2], 'weight', {'user' : [0,1], 'game' : 0})
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4
    assert subg['follow'].number_of_edges() == 2
@@ -356,7 +394,7 @@ def _test_sample_neighbors_topk_outedge(hypersparse):
    g, hg = _gen_neighbor_topk_test_graph(hypersparse, True)

    def _test1():
-        subg = dgl.sampling.sample_neighbors_topk(g, [0, 1], 2, 'weight', edge_dir='out')
+        subg = dgl.sampling.select_topk(g, 2, 'weight', [0, 1], edge_dir='out')
        assert subg.number_of_nodes() == g.number_of_nodes()
        assert subg.number_of_edges() == 4
        u, v = subg.edges()
@@ -366,7 +404,7 @@ def _test_sample_neighbors_topk_outedge(hypersparse):
    _test1()

    def _test2():  # k > #neighbors
-        subg = dgl.sampling.sample_neighbors_topk(g, [0, 2], 2, 'weight', edge_dir='out')
+        subg = dgl.sampling.select_topk(g, 2, 'weight', [0, 2], edge_dir='out')
        assert subg.number_of_nodes() == g.number_of_nodes()
        assert subg.number_of_edges() == 3
        u, v = subg.edges()
@@ -376,7 +414,7 @@ def _test_sample_neighbors_topk_outedge(hypersparse):
    _test2()

    def _test3():
-        subg = dgl.sampling.sample_neighbors_topk(hg, {'user' : [0,1], 'game' : 0}, 2, 'weight', edge_dir='out')
+        subg = dgl.sampling.select_topk(hg, 2, 'weight', {'user' : [0,1], 'game' : 0}, edge_dir='out')
        assert len(subg.ntypes) == 3
        assert len(subg.etypes) == 4
        u, v = subg['follow'].edges()
@@ -417,6 +455,7 @@ def test_sample_neighbors_topk_outedge():
 if __name__ == '__main__':
    test_random_walk()
    test_pack_traces()
+    test_pinsage_sampling()
    test_sample_neighbors()
    test_sample_neighbors_outedge()
    test_sample_neighbors_topk()

--- a/tests/compute/test_transform.py
+++ b/tests/compute/test_transform.py
 from scipy import sparse as spsp
+import unittest
 import networkx as nx
 import numpy as np
 import dgl
@@ -282,6 +283,119 @@ def test_out_subgraph():
    assert edge_set == {(0,0),(1,0)}
    assert F.array_equal(hg['flips'].edge_ids(u, v), subg['flips'].edata[dgl.EID])

+@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU compaction not implemented")
+def test_compact():  # checks dgl.compact_graphs on single/multiple graphs, with and without always_preserve
+    g1 = dgl.heterograph({
+        ('user', 'follow', 'user'): [(1, 3), (3, 5)],
+        ('user', 'plays', 'game'): [(2, 4), (3, 4), (2, 5)],
+        ('game', 'wished-by', 'user'): [(6, 7), (5, 7)]},
+        {'user': 20, 'game': 10})
+
+    g2 = dgl.heterograph({
+        ('game', 'clicked-by', 'user'): [(3, 1)],
+        ('user', 'likes', 'user'): [(1, 8), (8, 9)]},
+        {'user': 20, 'game': 10})
+
+    g3 = dgl.graph([(0, 1), (1, 2)], card=10, ntype='user')
+    g4 = dgl.graph([(1, 3), (3, 5)], card=10, ntype='user')
+
+    def _check(g, new_g, induced_nodes):  # induced_nodes[ntype][new_id] is read as the original node ID
+        assert g.ntypes == new_g.ntypes
+        assert g.canonical_etypes == new_g.canonical_etypes
+
+        for ntype in g.ntypes:
+            assert -1 not in induced_nodes[ntype]
+
+        for etype in g.canonical_etypes:
+            g_src, g_dst = g.all_edges(order='eid', etype=etype)
+            g_src = F.asnumpy(g_src)
+            g_dst = F.asnumpy(g_dst)
+            new_g_src, new_g_dst = new_g.all_edges(order='eid', etype=etype)
+            new_g_src_mapped = induced_nodes[etype[0]][F.asnumpy(new_g_src)]  # map new endpoints back to original IDs
+            new_g_dst_mapped = induced_nodes[etype[2]][F.asnumpy(new_g_dst)]
+            assert (g_src == new_g_src_mapped).all()  # edges, in eid order, must match the original graph
+            assert (g_dst == new_g_dst_mapped).all()
+
+    # Test default
+    new_g1 = dgl.compact_graphs(g1)
+    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes}
+    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
+    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7])  # exactly the users appearing in g1's edge lists
+    assert set(induced_nodes['game']) == set([4, 5, 6])  # exactly the games appearing in g1's edge lists
+    _check(g1, new_g1, induced_nodes)
+
+    # Test with always_preserve given a dict
+    new_g1 = dgl.compact_graphs(
+        g1, always_preserve={'game': F.tensor([4, 7], dtype=F.int64)})
+    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes}
+    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
+    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7])
+    assert set(induced_nodes['game']) == set([4, 5, 6, 7])  # game 7 has no edges in g1 but is kept
+    _check(g1, new_g1, induced_nodes)
+
+    # Test with always_preserve given a tensor
+    new_g3 = dgl.compact_graphs(
+        g3, always_preserve=F.tensor([1, 7], dtype=F.int64))
+    induced_nodes = {ntype: new_g3.nodes[ntype].data[dgl.NID] for ntype in new_g3.ntypes}
+    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
+    assert set(induced_nodes['user']) == set([0, 1, 2, 7])  # node 7 has no edges in g3 but is kept
+    _check(g3, new_g3, induced_nodes)
+
+    # Test multiple graphs
+    new_g1, new_g2 = dgl.compact_graphs([g1, g2])
+    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes}  # read from new_g1, then used to check new_g2 as well
+    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
+    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7, 8, 9])
+    assert set(induced_nodes['game']) == set([3, 4, 5, 6])
+    _check(g1, new_g1, induced_nodes)
+    _check(g2, new_g2, induced_nodes)
+
+    # Test multiple graphs with always_preserve given a dict
+    new_g1, new_g2 = dgl.compact_graphs(
+        [g1, g2], always_preserve={'game': F.tensor([4, 7], dtype=F.int64)})
+    induced_nodes = {ntype: new_g1.nodes[ntype].data[dgl.NID] for ntype in new_g1.ntypes}
+    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
+    assert set(induced_nodes['user']) == set([1, 3, 5, 2, 7, 8, 9])
+    assert set(induced_nodes['game']) == set([3, 4, 5, 6, 7])
+    _check(g1, new_g1, induced_nodes)
+    _check(g2, new_g2, induced_nodes)
+
+    # Test multiple graphs with always_preserve given a tensor
+    new_g3, new_g4 = dgl.compact_graphs(
+        [g3, g4], always_preserve=F.tensor([1, 7], dtype=F.int64))
+    induced_nodes = {ntype: new_g3.nodes[ntype].data[dgl.NID] for ntype in new_g3.ntypes}
+    induced_nodes = {k: F.asnumpy(v) for k, v in induced_nodes.items()}
+    assert set(induced_nodes['user']) == set([0, 1, 2, 3, 5, 7])
+    _check(g3, new_g3, induced_nodes)
+    _check(g4, new_g4, induced_nodes)
+
+
+def test_to_simple():  # checks dgl.to_simple: deduplicated edges, per-edge counts, and old->new edge ID mapping
+    g = dgl.heterograph({  # both edge lists contain repeated (src, dst) pairs
+        ('user', 'follow', 'user'): [(0, 1), (1, 3), (2, 2), (1, 3), (1, 4), (1, 4)],
+        ('user', 'plays', 'game'): [(3, 5), (2, 3), (1, 4), (1, 4), (3, 5), (2, 3), (2, 3)]})
+    sg = dgl.to_simple(g, return_counts='weights', writeback_mapping='new_eid')
+
+    for etype in g.canonical_etypes:
+        u, v = g.all_edges(form='uv', order='eid', etype=etype)
+        u = F.asnumpy(u).tolist()
+        v = F.asnumpy(v).tolist()
+        uv = list(zip(u, v))  # original edges in edge-ID order
+        eid_map = F.asnumpy(g.edges[etype].data['new_eid'])  # mapping is read back from the ORIGINAL graph's edge data
+
+        su, sv = sg.all_edges(form='uv', order='eid', etype=etype)
+        su = F.asnumpy(su).tolist()
+        sv = F.asnumpy(sv).tolist()
+        suv = list(zip(su, sv))  # edges of the result graph in edge-ID order
+        sw = F.asnumpy(sg.edges[etype].data['weights'])  # counts are read from the RESULT graph's edge data
+
+        assert set(uv) == set(suv)  # same set of distinct (src, dst) pairs
+        for i, e in enumerate(suv):
+            assert sw[i] == sum(e == _e for _e in uv)  # count equals the pair's multiplicity in the original
+        for i, e in enumerate(uv):
+            assert eid_map[i] == suv.index(e)  # old edge i maps to the result edge with the same endpoints
+
+
 if __name__ == '__main__':
    test_line_graph()
    test_no_backtracking()
@@ -295,5 +409,7 @@ if __name__ == '__main__':
    test_remove_self_loop()
    test_add_self_loop()
    test_partition()
+    test_compact()
+    test_to_simple()
    test_in_subgraph()
    test_out_subgraph()