Unverified Commit ca2a7e1c authored by Minjie Wang's avatar Minjie Wang Committed by GitHub
Browse files

[Refactor] Nodeflow, sampling, CAPI (#430)

* enable cython

* add helper function and data structure for void_p vector return

* move sampler from graph index to contrib.sampling

* WIP

* WIP

* refactor layer sampling

* pass tests

* fix lint

* fix graphsage

* remove comments

* pickle test

* fix comments

* update dev guide for cython build
parent 27e0e547
...@@ -2,7 +2,7 @@ import backend as F ...@@ -2,7 +2,7 @@ import backend as F
import numpy as np import numpy as np
import scipy as sp import scipy as sp
import dgl import dgl
from dgl.node_flow import create_full_node_flow from dgl.contrib.sampling.sampler import create_full_nodeflow, NeighborSampler
from dgl import utils from dgl import utils
import dgl.function as fn import dgl.function as fn
from functools import partial from functools import partial
...@@ -38,15 +38,13 @@ def test_self_loop(): ...@@ -38,15 +38,13 @@ def test_self_loop():
deg = F.ones(in_deg.shape, dtype=F.int64) * n deg = F.ones(in_deg.shape, dtype=F.int64) * n
assert F.array_equal(in_deg, deg) assert F.array_equal(in_deg, deg)
def create_mini_batch(g, num_hops, add_self_loop=False): def create_mini_batch(g, num_hops, add_self_loop=False):
seed_ids = np.array([0, 1, 2, 3]) seed_ids = np.array([0, 1, 2, 3])
seed_ids = utils.toindex(seed_ids) sampler = NeighborSampler(g, batch_size=4, expand_factor=g.number_of_nodes(),
sgi = g._graph.neighbor_sampling([seed_ids], g.number_of_nodes(), num_hops, num_hops=num_hops, seed_nodes=seed_ids, add_self_loop=add_self_loop)
"in", None, add_self_loop) nfs = list(sampler)
assert len(sgi) == 1 assert len(nfs) == 1
return dgl.node_flow.NodeFlow(g, sgi[0]) return nfs[0]
def check_basic(g, nf): def check_basic(g, nf):
num_nodes = 0 num_nodes = 0
...@@ -71,21 +69,16 @@ def check_basic(g, nf): ...@@ -71,21 +69,16 @@ def check_basic(g, nf):
def test_basic(): def test_basic():
num_layers = 2 num_layers = 2
g = generate_rand_graph(100, connect_more=True) g = generate_rand_graph(100, connect_more=True)
print(0, 0) nf = create_full_nodeflow(g, num_layers)
nf = create_full_node_flow(g, num_layers)
print(0, 1)
assert nf.number_of_nodes() == g.number_of_nodes() * (num_layers + 1) assert nf.number_of_nodes() == g.number_of_nodes() * (num_layers + 1)
assert nf.number_of_edges() == g.number_of_edges() * num_layers assert nf.number_of_edges() == g.number_of_edges() * num_layers
assert nf.num_layers == num_layers + 1 assert nf.num_layers == num_layers + 1
assert nf.layer_size(0) == g.number_of_nodes() assert nf.layer_size(0) == g.number_of_nodes()
assert nf.layer_size(1) == g.number_of_nodes() assert nf.layer_size(1) == g.number_of_nodes()
check_basic(g, nf) check_basic(g, nf)
print(0, 2)
parent_nids = F.arange(0, g.number_of_nodes()) parent_nids = F.arange(0, g.number_of_nodes())
nids = dgl.graph_index.map_to_nodeflow_nid(nf._graph, 0, nids = nf.map_from_parent_nid(0, parent_nids)
utils.toindex(parent_nids)).tousertensor()
print(0, 3)
assert F.array_equal(nids, parent_nids) assert F.array_equal(nids, parent_nids)
g = generate_rand_graph(100) g = generate_rand_graph(100)
...@@ -114,7 +107,7 @@ def check_apply_nodes(create_node_flow): ...@@ -114,7 +107,7 @@ def check_apply_nodes(create_node_flow):
def test_apply_nodes(): def test_apply_nodes():
check_apply_nodes(create_full_node_flow) check_apply_nodes(create_full_nodeflow)
check_apply_nodes(create_mini_batch) check_apply_nodes(create_mini_batch)
...@@ -132,7 +125,7 @@ def check_apply_edges(create_node_flow): ...@@ -132,7 +125,7 @@ def check_apply_edges(create_node_flow):
def test_apply_edges(): def test_apply_edges():
check_apply_edges(create_full_node_flow) check_apply_edges(create_full_nodeflow)
check_apply_edges(create_mini_batch) check_apply_edges(create_mini_batch)
...@@ -165,7 +158,7 @@ def check_flow_compute(create_node_flow): ...@@ -165,7 +158,7 @@ def check_flow_compute(create_node_flow):
def test_flow_compute(): def test_flow_compute():
check_flow_compute(create_full_node_flow) check_flow_compute(create_full_nodeflow)
check_flow_compute(create_mini_batch) check_flow_compute(create_mini_batch)
...@@ -187,7 +180,7 @@ def check_prop_flows(create_node_flow): ...@@ -187,7 +180,7 @@ def check_prop_flows(create_node_flow):
def test_prop_flows(): def test_prop_flows():
check_prop_flows(create_full_node_flow) check_prop_flows(create_full_nodeflow)
check_prop_flows(create_mini_batch) check_prop_flows(create_mini_batch)
......
import dgl import dgl
import dgl.contrib as contrib
from dgl.frame import Frame, FrameRef, Column from dgl.frame import Frame, FrameRef, Column
from dgl.graph_index import create_graph_index from dgl.graph_index import create_graph_index
from dgl.utils import toindex from dgl.utils import toindex
...@@ -75,6 +76,27 @@ def _assert_is_identical(g, g2): ...@@ -75,6 +76,27 @@ def _assert_is_identical(g, g2):
for k in g.edata: for k in g.edata:
assert F.allclose(g.edata[k], g2.edata[k]) assert F.allclose(g.edata[k], g2.edata[k])
def _assert_is_identical_nodeflow(nf1, nf2):
    """Assert that two NodeFlows are structurally identical and carry
    the same per-layer and per-block feature data."""
    def frames_equal(frame1, frame2):
        # Same feature names, and numerically close values for each one.
        assert frame1.data.keys() == frame2.data.keys()
        for key in frame1.data:
            assert F.allclose(frame1.data[key], frame2.data[key])

    assert nf1.is_multigraph == nf2.is_multigraph
    assert nf1.is_readonly == nf2.is_readonly
    assert nf1.number_of_nodes() == nf2.number_of_nodes()
    src1, dst1 = nf1.all_edges()
    src2, dst2 = nf2.all_edges()
    assert F.array_equal(src1, src2)
    assert F.array_equal(dst1, dst2)
    assert nf1.num_layers == nf2.num_layers
    for idx in range(nf1.num_layers):
        assert nf1.layer_size(idx) == nf2.layer_size(idx)
        frames_equal(nf1.layers[idx], nf2.layers[idx])
    assert nf1.num_blocks == nf2.num_blocks
    for idx in range(nf1.num_blocks):
        assert nf1.block_size(idx) == nf2.block_size(idx)
        frames_equal(nf1.blocks[idx], nf2.blocks[idx])
def _global_message_func(nodes): def _global_message_func(nodes):
return {'x': nodes.data['x']} return {'x': nodes.data['x']}
...@@ -157,9 +179,19 @@ def test_pickling_graph(): ...@@ -157,9 +179,19 @@ def test_pickling_graph():
new_g = _reconstruct_pickle(g) new_g = _reconstruct_pickle(g)
_assert_is_identical(g, new_g) _assert_is_identical(g, new_g)
def test_pickling_nodeflow():
    """Round-trip a NodeFlow through pickle and verify it is unchanged."""
    edges = [(0, 1), (1, 2), (2, 3), (3, 0)]
    graph = dgl.DGLGraph(edges, readonly=True)
    graph.ndata['x'] = F.randn((4, 5))
    graph.edata['y'] = F.randn((4, 3))
    nf = contrib.sampling.sampler.create_full_nodeflow(graph, 5)
    # Pull node/edge features from the parent graph so the NodeFlow
    # actually carries data to compare after unpickling.
    nf.copy_from_parent()
    restored = _reconstruct_pickle(nf)
    _assert_is_identical_nodeflow(nf, restored)
if __name__ == '__main__': if __name__ == '__main__':
test_pickling_index() test_pickling_index()
test_pickling_graph_index() test_pickling_graph_index()
test_pickling_frame() test_pickling_frame()
test_pickling_graph() test_pickling_graph()
test_pickling_nodeflow()
...@@ -8,11 +8,17 @@ def generate_rand_graph(n): ...@@ -8,11 +8,17 @@ def generate_rand_graph(n):
arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(np.int64) arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(np.int64)
return dgl.DGLGraph(arr, readonly=True) return dgl.DGLGraph(arr, readonly=True)
def test_create_full():
    """A full NodeFlow over ``num_hops`` hops has ``num_hops + 1`` layers,
    each containing every parent node, and ``num_hops`` blocks, each
    replicating the full parent edge set.

    Derives both expected counts from ``g`` instead of hard-coding
    ``600`` (= 100 * (5 + 1)), keeping the two assertions consistent.
    """
    num_hops = 5
    g = generate_rand_graph(100)
    full_nf = dgl.contrib.sampling.sampler.create_full_nodeflow(g, num_hops)
    # One layer per hop plus the seed layer, each with all parent nodes.
    assert full_nf.number_of_nodes() == g.number_of_nodes() * (num_hops + 1)
    # Each block between consecutive layers holds every parent edge.
    assert full_nf.number_of_edges() == g.number_of_edges() * num_hops
def test_1neighbor_sampler_all(): def test_1neighbor_sampler_all():
g = generate_rand_graph(100) g = generate_rand_graph(100)
# In this case, NeighborSampling simply gets the neighborhood of a single vertex. # In this case, NeighborSampling simply gets the neighborhood of a single vertex.
for subg in dgl.contrib.sampling.NeighborSampler(g, 1, 100, neighbor_type='in', for i, subg in enumerate(dgl.contrib.sampling.NeighborSampler(
num_workers=4): g, 1, 100, neighbor_type='in', num_workers=4)):
seed_ids = subg.layer_parent_nid(-1) seed_ids = subg.layer_parent_nid(-1)
assert len(seed_ids) == 1 assert len(seed_ids) == 1
src, dst, eid = g.in_edges(seed_ids, form='all') src, dst, eid = g.in_edges(seed_ids, form='all')
...@@ -164,6 +170,7 @@ def test_random_walk(): ...@@ -164,6 +170,7 @@ def test_random_walk():
assert (np.abs(trace_diff) == 1).all() assert (np.abs(trace_diff) == 1).all()
if __name__ == '__main__': if __name__ == '__main__':
test_create_full()
test_1neighbor_sampler_all() test_1neighbor_sampler_all()
test_10neighbor_sampler_all() test_10neighbor_sampler_all()
test_1neighbor_sampler() test_1neighbor_sampler()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment