Unverified commit c3a33407, authored by Quan (Andy) Gan, committed by GitHub
Browse files

[Feature] PinSAGE sampler (#1249)



* [WIP] PinSAGE operators

* moved the edge remapping mess into C

* some docstrings

* lint

* lint x2

* lint x3

* skip gpu test on topk

* extend pinsage to any metapath

* lint x4

* addresses #1265

* add always_preserve (fixes #1266) and fix a silly bug

* disable gpu test on compaction

* lint

* fix a horrible bug and add more tests

* lint

* addresses comments

* lint

* bugfix

* addresses comments
Co-authored-by: Minjie Wang <minjie.wang@nyu.edu>
parent 87bca129
/*!
* Copyright (c) 2019 by Contributors
* \file graph/transform/to_simple.cc
* \brief Convert multigraphs to simple graphs
*/
#include <dgl/base_heterograph.h>
#include <dgl/transform.h>
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <vector>
#include <utility>
#include "../unit_graph.h"
#include "../../c_api_common.h"
namespace dgl {
using namespace dgl::runtime;
using namespace dgl::aten;
namespace transform {
/*!
 * \brief Build a simple graph from \a graph by coalescing the edges of every edge type.
 *
 * Each relation is processed independently: its COO adjacency is sorted, coalesced,
 * and turned into a new unit graph; the unit graphs are then assembled over the
 * metagraph of the input.
 *
 * \param graph The input heterograph.
 * \return A tuple of
 *   - the heterograph created from the coalesced relation graphs,
 *   - one array per edge type holding, for each new edge, the number of original
 *     edges that coalesced into it (the second result of COOCoalesce),
 *   - one array per edge type where element i is the new edge ID that original
 *     edge #i is mapped to.
 */
std::tuple<HeteroGraphPtr, std::vector<IdArray>, std::vector<IdArray>>
ToSimpleGraph(const HeteroGraphPtr graph) {
  const int64_t num_etypes = graph->NumEdgeTypes();
  const auto metagraph = graph->meta_graph();
  std::vector<IdArray> counts(num_etypes), edge_maps(num_etypes);
  std::vector<HeteroGraphPtr> rel_graphs(num_etypes);

  for (int64_t etype = 0; etype < num_etypes; ++etype) {
    const auto vtypes = graph->GetEndpointTypes(etype);
    const COOMatrix adj = graph->GetCOOMatrix(etype);
    const COOMatrix sorted_adj = COOSort(adj, true);
    // After sorting, the data array lists the original edge IDs in sorted order.
    const IdArray eids_shuffled = sorted_adj.data;
    const auto &coalesced_result = COOCoalesce(sorted_adj);
    const COOMatrix &coalesced_adj = coalesced_result.first;
    const IdArray &count = coalesced_result.second;

    /*
     * eids_shuffled actually already contains the mapping from old edge space to the
     * new one:
     *
     * * eids_shuffled[0:count[0]] indicates the original edge IDs that coalesced into new
     *   edge #0.
     * * eids_shuffled[count[0]:count[0] + count[1]] indicates those that coalesced into
     *   new edge #1.
     * * eids_shuffled[count[0] + count[1]:count[0] + count[1] + count[2]] indicates those
     *   that coalesced into new edge #2.
     * * etc.
     *
     * Here, we need to translate eids_shuffled to an array "eids_remapped" such that
     * eids_remapped[i] indicates the new edge ID the old edge #i is mapped to.  The
     * translation can simply be achieved by (in numpy code):
     *
     *     new_eid_for_eids_shuffled = np.arange(len(count)).repeat(count)
     *     eids_remapped = np.zeros_like(new_eid_for_eids_shuffled)
     *     eids_remapped[eids_shuffled] = new_eid_for_eids_shuffled
     */
    const IdArray new_eids = Range(
        0, coalesced_adj.row->shape[0], coalesced_adj.row->dtype.bits, coalesced_adj.row->ctx);
    const IdArray eids_remapped = Scatter(Repeat(new_eids, count), eids_shuffled);

    edge_maps[etype] = eids_remapped;
    counts[etype] = count;
    rel_graphs[etype] = UnitGraph::CreateFromCOO(
        // One vertex type when source and destination types coincide, two otherwise.
        vtypes.first == vtypes.second ? 1 : 2,
        coalesced_adj.num_rows,
        coalesced_adj.num_cols,
        coalesced_adj.row,
        coalesced_adj.col);
  }

  const HeteroGraphPtr result = CreateHeteroGraph(metagraph, rel_graphs);
  // Neither vector is used past this point, so hand them to the tuple instead of
  // copying every per-edge-type array handle.
  return std::make_tuple(result, std::move(counts), std::move(edge_maps));
}
/*!
 * \brief Registered entry point "transform._CAPI_DGLToSimpleHetero".
 *
 * Takes a heterograph reference as its only argument, runs ToSimpleGraph on it and
 * returns a three-element list: the resulting graph, the list of per-edge-type
 * count arrays, and the list of per-edge-type edge mapping arrays.
 */
DGL_REGISTER_GLOBAL("transform._CAPI_DGLToSimpleHetero")
.set_body([] (DGLArgs args, DGLRetValue *rv) {
    const HeteroGraphRef input = args[0];
    const auto simplified = ToSimpleGraph(input.sptr());

    // Wrap each array in a Value so that it can be stored in an object list.
    List<Value> boxed_counts;
    for (const IdArray &arr : std::get<1>(simplified))
      boxed_counts.push_back(Value(MakeValue(arr)));

    List<Value> boxed_edge_maps;
    for (const IdArray &arr : std::get<2>(simplified))
      boxed_edge_maps.push_back(Value(MakeValue(arr)));

    // Pack in the order: graph, counts, edge maps.
    List<ObjectRef> packed;
    packed.push_back(HeteroGraphRef(std::get<0>(simplified)));
    packed.push_back(boxed_counts);
    packed.push_back(boxed_edge_maps);
    *rv = packed;
  });
}; // namespace transform
}; // namespace dgl
......@@ -1365,51 +1365,6 @@ def test_empty_heterograph():
assert g.number_of_nodes('developer') == 2
def test_compact():
    """Check dgl.compact_graphs on a single heterograph and on a pair of them.

    The induced node IDs stored under dgl.NID must match the expected sets, and
    mapping the compacted endpoints back through them must reproduce the
    original edges in edge-ID order.
    """
    g1 = dgl.heterograph({
        ('user', 'follow', 'user'): [(1, 3), (3, 5)],
        ('user', 'plays', 'game'): [(2, 4), (3, 4), (2, 5)],
        ('game', 'wished-by', 'user'): [(6, 7), (5, 7)]},
        {'user': 20, 'game': 10})
    g2 = dgl.heterograph({
        ('game', 'clicked-by', 'user'): [(3, 1)],
        ('user', 'likes', 'user'): [(1, 8), (8, 9)]},
        {'user': 20, 'game': 10})

    def _induced_ids(compacted):
        # Per node type, the original node IDs kept by compaction, as numpy arrays.
        return {
            ntype: F.asnumpy(compacted.nodes[ntype].data[dgl.NID])
            for ntype in compacted.ntypes}

    def _check(original, compacted, nid_map):
        assert original.ntypes == compacted.ntypes
        assert original.canonical_etypes == compacted.canonical_etypes
        assert all(-1 not in nid_map[ntype] for ntype in original.ntypes)

        for cetype in original.canonical_etypes:
            old_src, old_dst = (
                F.asnumpy(x) for x in original.all_edges(order='eid', etype=cetype))
            new_src, new_dst = compacted.all_edges(order='eid', etype=cetype)
            # Translate compacted endpoints back into the original ID space.
            restored_src = nid_map[cetype[0]][F.asnumpy(new_src)]
            restored_dst = nid_map[cetype[2]][F.asnumpy(new_dst)]
            assert (old_src == restored_src).all()
            assert (old_dst == restored_dst).all()

    # Single graph
    compacted1 = dgl.compact_graphs(g1)
    nid_map = _induced_ids(compacted1)
    assert set(nid_map['user']) == {1, 2, 3, 5, 7}
    assert set(nid_map['game']) == {4, 5, 6}
    _check(g1, compacted1, nid_map)

    # Two graphs compacted together
    compacted1, compacted2 = dgl.compact_graphs([g1, g2])
    nid_map = _induced_ids(compacted1)
    assert set(nid_map['user']) == {1, 2, 3, 5, 7, 8, 9}
    assert set(nid_map['game']) == {3, 4, 5, 6}
    _check(g1, compacted1, nid_map)
    _check(g2, compacted2, nid_map)
def test_types_in_function():
def mfunc1(edges):
assert edges.canonical_etype == ('user', 'follow', 'user')
......@@ -1513,6 +1468,5 @@ if __name__ == '__main__':
test_updates()
test_backward()
test_empty_heterograph()
test_compact()
test_types_in_function()
test_stack_reduce()
......@@ -37,6 +37,7 @@ def test_random_walk():
('item', 'viewed-by', 'user'): [(0, 0), (1, 0), (1, 1), (2, 2), (2, 3), (1, 3)]})
g2.edata['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32)
g2.edata['p2'] = F.tensor([[3], [0], [3], [3], [3]], dtype=F.float32)
g4.edges['follow'].data['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32)
g4.edges['viewed-by'].data['p'] = F.tensor([1, 1, 1, 1, 1, 1], dtype=F.float32)
......@@ -62,6 +63,14 @@ def test_random_walk():
g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4, prob='p')
check_random_walk(g2, ['follow'] * 4, traces, ntypes, 'p')
try:
traces, ntypes = dgl.sampling.random_walk(
g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4, prob='p2')
fail = False
except dgl.DGLError:
fail = True
assert fail
metapath = ['follow', 'view', 'viewed-by'] * 2
traces, ntypes = dgl.sampling.random_walk(
g3, [0, 1, 2, 0, 1, 2], metapath=metapath)
......@@ -103,6 +112,35 @@ def test_pack_traces():
assert F.array_equal(result[2], F.tensor([2, 7], dtype=F.int64))
assert F.array_equal(result[3], F.tensor([0, 2], dtype=F.int64))
def test_pinsage_sampling():
    """Exercise PinSAGESampler and RandomWalkNeighborSampler on small graphs.

    Every sampler is invoked on seed nodes 0 and 2; the returned graph must have
    the expected single node type and contain at least one of the accepted
    edges for each seed.
    """
    def _test_sampler(g, sampler, ntype):
        seeds = F.tensor([0, 2], dtype=F.int64)
        neighbor_g = sampler(seeds)
        assert neighbor_g.ntypes == [ntype]

        src, dst = neighbor_g.all_edges(form='uv', order='eid')
        sampled = set(zip(F.asnumpy(src).tolist(), F.asnumpy(dst).tolist()))
        # At least one accepted edge must be present for each seed.
        assert not sampled.isdisjoint({(1, 0), (0, 0)})
        assert not sampled.isdisjoint({(2, 2), (3, 2)})

    # Bipartite item/user graph
    g = dgl.heterograph({
        ('item', 'bought-by', 'user'): [(0, 0), (0, 1), (1, 0), (1, 1), (2, 2), (2, 3), (3, 2), (3, 3)],
        ('user', 'bought', 'item'): [(0, 0), (1, 0), (0, 1), (1, 1), (2, 2), (3, 2), (2, 3), (3, 3)]})
    _test_sampler(g, dgl.sampling.PinSAGESampler(g, 'item', 'user', 4, 0.5, 3, 2), 'item')
    _test_sampler(
        g,
        dgl.sampling.RandomWalkNeighborSampler(g, 4, 0.5, 3, 2, ['bought-by', 'bought']),
        'item')
    _test_sampler(
        g,
        dgl.sampling.RandomWalkNeighborSampler(
            g, 4, 0.5, 3, 2,
            [('item', 'bought-by', 'user'), ('user', 'bought', 'item')]),
        'item')

    # Graph built from a plain edge list, no metapath given
    g = dgl.graph([(0, 0), (0, 1), (1, 0), (1, 1), (2, 2), (2, 3), (3, 2), (3, 3)])
    _test_sampler(g, dgl.sampling.RandomWalkNeighborSampler(g, 4, 0.5, 3, 2), g.ntypes[0])

    # Three-hop metapath A -> B -> C -> A
    g = dgl.heterograph({
        ('A', 'AB', 'B'): [(0, 1), (2, 3)],
        ('B', 'BC', 'C'): [(1, 2), (3, 1)],
        ('C', 'CA', 'A'): [(2, 0), (1, 2)]})
    _test_sampler(
        g,
        dgl.sampling.RandomWalkNeighborSampler(g, 4, 0.5, 3, 2, ['AB', 'BC', 'CA']),
        'A')
def _gen_neighbor_sampling_test_graph(hypersparse, reverse):
if hypersparse:
# should crash if allocated a CSR
......@@ -305,7 +343,7 @@ def _test_sample_neighbors_topk(hypersparse):
g, hg = _gen_neighbor_topk_test_graph(hypersparse, False)
def _test1():
subg = dgl.sampling.sample_neighbors_topk(g, [0, 1], 2, 'weight')
subg = dgl.sampling.select_topk(g, 2, 'weight', [0, 1])
assert subg.number_of_nodes() == g.number_of_nodes()
assert subg.number_of_edges() == 4
u, v = subg.edges()
......@@ -315,7 +353,7 @@ def _test_sample_neighbors_topk(hypersparse):
_test1()
def _test2(): # k > #neighbors
subg = dgl.sampling.sample_neighbors_topk(g, [0, 2], 2, 'weight')
subg = dgl.sampling.select_topk(g, 2, 'weight', [0, 2])
assert subg.number_of_nodes() == g.number_of_nodes()
assert subg.number_of_edges() == 3
u, v = subg.edges()
......@@ -325,7 +363,7 @@ def _test_sample_neighbors_topk(hypersparse):
_test2()
def _test3():
subg = dgl.sampling.sample_neighbors_topk(hg, {'user' : [0,1], 'game' : 0}, 2, 'weight')
subg = dgl.sampling.select_topk(hg, 2, 'weight', {'user' : [0,1], 'game' : 0})
assert len(subg.ntypes) == 3
assert len(subg.etypes) == 4
u, v = subg['follow'].edges()
......@@ -344,7 +382,7 @@ def _test_sample_neighbors_topk(hypersparse):
_test3()
# test different k for different relations
subg = dgl.sampling.sample_neighbors_topk(hg, {'user' : [0,1], 'game' : 0}, [1, 2, 0, 2], 'weight')
subg = dgl.sampling.select_topk(hg, [1, 2, 0, 2], 'weight', {'user' : [0,1], 'game' : 0})
assert len(subg.ntypes) == 3
assert len(subg.etypes) == 4
assert subg['follow'].number_of_edges() == 2
......@@ -356,7 +394,7 @@ def _test_sample_neighbors_topk_outedge(hypersparse):
g, hg = _gen_neighbor_topk_test_graph(hypersparse, True)
def _test1():
subg = dgl.sampling.sample_neighbors_topk(g, [0, 1], 2, 'weight', edge_dir='out')
subg = dgl.sampling.select_topk(g, 2, 'weight', [0, 1], edge_dir='out')
assert subg.number_of_nodes() == g.number_of_nodes()
assert subg.number_of_edges() == 4
u, v = subg.edges()
......@@ -366,7 +404,7 @@ def _test_sample_neighbors_topk_outedge(hypersparse):
_test1()
def _test2(): # k > #neighbors
subg = dgl.sampling.sample_neighbors_topk(g, [0, 2], 2, 'weight', edge_dir='out')
subg = dgl.sampling.select_topk(g, 2, 'weight', [0, 2], edge_dir='out')
assert subg.number_of_nodes() == g.number_of_nodes()
assert subg.number_of_edges() == 3
u, v = subg.edges()
......@@ -376,7 +414,7 @@ def _test_sample_neighbors_topk_outedge(hypersparse):
_test2()
def _test3():
subg = dgl.sampling.sample_neighbors_topk(hg, {'user' : [0,1], 'game' : 0}, 2, 'weight', edge_dir='out')
subg = dgl.sampling.select_topk(hg, 2, 'weight', {'user' : [0,1], 'game' : 0}, edge_dir='out')
assert len(subg.ntypes) == 3
assert len(subg.etypes) == 4
u, v = subg['follow'].edges()
......@@ -417,6 +455,7 @@ def test_sample_neighbors_topk_outedge():
if __name__ == '__main__':
test_random_walk()
test_pack_traces()
test_pinsage_sampling()
test_sample_neighbors()
test_sample_neighbors_outedge()
test_sample_neighbors_topk()
......
from scipy import sparse as spsp
import unittest
import networkx as nx
import numpy as np
import dgl
......@@ -282,6 +283,119 @@ def test_out_subgraph():
assert edge_set == {(0,0),(1,0)}
assert F.array_equal(hg['flips'].edge_ids(u, v), subg['flips'].edata[dgl.EID])
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU compaction not implemented")
def test_compact():
    """Check dgl.compact_graphs with and without always_preserve.

    Covers a single graph and a list of graphs, with always_preserve omitted,
    given as a dict keyed by node type, and given as a plain tensor.  In every
    case the induced node IDs (dgl.NID) must equal the expected set and must
    map the compacted edges back onto the original ones.
    """
    g1 = dgl.heterograph({
        ('user', 'follow', 'user'): [(1, 3), (3, 5)],
        ('user', 'plays', 'game'): [(2, 4), (3, 4), (2, 5)],
        ('game', 'wished-by', 'user'): [(6, 7), (5, 7)]},
        {'user': 20, 'game': 10})
    g2 = dgl.heterograph({
        ('game', 'clicked-by', 'user'): [(3, 1)],
        ('user', 'likes', 'user'): [(1, 8), (8, 9)]},
        {'user': 20, 'game': 10})
    g3 = dgl.graph([(0, 1), (1, 2)], card=10, ntype='user')
    g4 = dgl.graph([(1, 3), (3, 5)], card=10, ntype='user')

    def _induced_ids(compacted):
        # Per node type, the original node IDs kept by compaction, as numpy arrays.
        return {
            ntype: F.asnumpy(compacted.nodes[ntype].data[dgl.NID])
            for ntype in compacted.ntypes}

    def _check(original, compacted, nid_map):
        assert original.ntypes == compacted.ntypes
        assert original.canonical_etypes == compacted.canonical_etypes
        assert all(-1 not in nid_map[ntype] for ntype in original.ntypes)

        for cetype in original.canonical_etypes:
            old_src, old_dst = (
                F.asnumpy(x) for x in original.all_edges(order='eid', etype=cetype))
            new_src, new_dst = compacted.all_edges(order='eid', etype=cetype)
            # Translate compacted endpoints back into the original ID space.
            restored_src = nid_map[cetype[0]][F.asnumpy(new_src)]
            restored_dst = nid_map[cetype[2]][F.asnumpy(new_dst)]
            assert (old_src == restored_src).all()
            assert (old_dst == restored_dst).all()

    # Test default
    compacted1 = dgl.compact_graphs(g1)
    nid_map = _induced_ids(compacted1)
    assert set(nid_map['user']) == {1, 2, 3, 5, 7}
    assert set(nid_map['game']) == {4, 5, 6}
    _check(g1, compacted1, nid_map)

    # Test with always_preserve given a dict
    compacted1 = dgl.compact_graphs(
        g1, always_preserve={'game': F.tensor([4, 7], dtype=F.int64)})
    nid_map = _induced_ids(compacted1)
    assert set(nid_map['user']) == {1, 2, 3, 5, 7}
    assert set(nid_map['game']) == {4, 5, 6, 7}
    _check(g1, compacted1, nid_map)

    # Test with always_preserve given a tensor
    compacted3 = dgl.compact_graphs(
        g3, always_preserve=F.tensor([1, 7], dtype=F.int64))
    nid_map = _induced_ids(compacted3)
    assert set(nid_map['user']) == {0, 1, 2, 7}
    _check(g3, compacted3, nid_map)

    # Test multiple graphs
    compacted1, compacted2 = dgl.compact_graphs([g1, g2])
    nid_map = _induced_ids(compacted1)
    assert set(nid_map['user']) == {1, 2, 3, 5, 7, 8, 9}
    assert set(nid_map['game']) == {3, 4, 5, 6}
    _check(g1, compacted1, nid_map)
    _check(g2, compacted2, nid_map)

    # Test multiple graphs with always_preserve given a dict
    compacted1, compacted2 = dgl.compact_graphs(
        [g1, g2], always_preserve={'game': F.tensor([4, 7], dtype=F.int64)})
    nid_map = _induced_ids(compacted1)
    assert set(nid_map['user']) == {1, 2, 3, 5, 7, 8, 9}
    assert set(nid_map['game']) == {3, 4, 5, 6, 7}
    _check(g1, compacted1, nid_map)
    _check(g2, compacted2, nid_map)

    # Test multiple graphs with always_preserve given a tensor
    compacted3, compacted4 = dgl.compact_graphs(
        [g3, g4], always_preserve=F.tensor([1, 7], dtype=F.int64))
    nid_map = _induced_ids(compacted3)
    assert set(nid_map['user']) == {0, 1, 2, 3, 5, 7}
    _check(g3, compacted3, nid_map)
    _check(g4, compacted4, nid_map)
def test_to_simple():
    """Check dgl.to_simple on a heterograph that contains duplicate edges.

    For every canonical edge type: the simple graph must hold the same set of
    (src, dst) pairs, the 'weights' edge data must equal the multiplicity of
    each pair in the original graph, and the 'new_eid' data written back to the
    original graph must point each old edge at its position in the simple graph.
    """
    g = dgl.heterograph({
        ('user', 'follow', 'user'): [(0, 1), (1, 3), (2, 2), (1, 3), (1, 4), (1, 4)],
        ('user', 'plays', 'game'): [(3, 5), (2, 3), (1, 4), (1, 4), (3, 5), (2, 3), (2, 3)]})
    sg = dgl.to_simple(g, return_counts='weights', writeback_mapping='new_eid')

    def _edge_pairs(graph, cetype):
        # (src, dst) tuples of one edge type, listed in edge-ID order.
        src, dst = graph.all_edges(form='uv', order='eid', etype=cetype)
        return list(zip(F.asnumpy(src).tolist(), F.asnumpy(dst).tolist()))

    for cetype in g.canonical_etypes:
        old_pairs = _edge_pairs(g, cetype)
        new_pairs = _edge_pairs(sg, cetype)
        old_to_new = F.asnumpy(g.edges[cetype].data['new_eid'])
        multiplicity = F.asnumpy(sg.edges[cetype].data['weights'])

        assert set(old_pairs) == set(new_pairs)
        for new_eid, pair in enumerate(new_pairs):
            assert multiplicity[new_eid] == old_pairs.count(pair)
        for old_eid, pair in enumerate(old_pairs):
            assert old_to_new[old_eid] == new_pairs.index(pair)
if __name__ == '__main__':
test_line_graph()
test_no_backtracking()
......@@ -295,5 +409,7 @@ if __name__ == '__main__':
test_remove_self_loop()
test_add_self_loop()
test_partition()
test_compact()
test_to_simple()
test_in_subgraph()
test_out_subgraph()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment