Unverified commit f5eb80d2, authored by Quan (Andy) Gan and committed by GitHub
Browse files

[Feature] Edge DataLoader for edge classification & link prediction (#1828)

* clean commit

* oops forgot the most important files

* use einsum

* copy feature from frontier to block

* Revert "copy feature from frontier to block"

This reverts commit 5224ec963eb6a3ef1b6ab74d8ecbd44e4e42f285.

* temp fix

* unit test

* fix

* revert jtnn

* lint

* fix win64

* docstring fixes and doc indexing

* revert einsum in sparse bidecoder

* fix some examples

* lint

* fix due to some tediousness in remove_edges

* addresses comments

* fix

* more jtnn fixes

* fix
parent d340ea3a
...@@ -334,6 +334,7 @@ def add_reverse_edges(g, readonly=None, copy_ndata=True, ...@@ -334,6 +334,7 @@ def add_reverse_edges(g, readonly=None, copy_ndata=True,
num_nodes_dict[ntype] = g.number_of_nodes(ntype) num_nodes_dict[ntype] = g.number_of_nodes(ntype)
canonical_etypes = g.canonical_etypes canonical_etypes = g.canonical_etypes
num_nodes_dict = {ntype: g.number_of_nodes(ntype) for ntype in g.ntypes}
# fast path # fast path
if ignore_bipartite is False: if ignore_bipartite is False:
subgs = {} subgs = {}
...@@ -403,10 +404,15 @@ def line_graph(g, backtracking=True, shared=False): ...@@ -403,10 +404,15 @@ def line_graph(g, backtracking=True, shared=False):
G : DGLHeteroGraph G : DGLHeteroGraph
The line graph of this graph. The line graph of this graph.
Examples: Notes
A = [[0, 0, 1], -----
[1, 0, 1], The implementation is done on CPU, even if the input and output graphs are on GPU.
[1, 1, 0]]
Examples
--------
>>> A = [[0, 0, 1],
... [1, 0, 1],
... [1, 1, 0]]
>>> g = dgl.graph(([0, 1, 1, 2, 2],[2, 0, 2, 0, 1]), 'user', 'follows') >>> g = dgl.graph(([0, 1, 1, 2, 2],[2, 0, 2, 0, 1]), 'user', 'follows')
>>> lg = g.line_graph() >>> lg = g.line_graph()
>>> lg >>> lg
...@@ -427,7 +433,10 @@ def line_graph(g, backtracking=True, shared=False): ...@@ -427,7 +433,10 @@ def line_graph(g, backtracking=True, shared=False):
""" """
assert g.is_homogeneous(), \ assert g.is_homogeneous(), \
'line_heterograph only support directed homogeneous graph right now' 'line_heterograph only support directed homogeneous graph right now'
lg = DGLHeteroGraph(_CAPI_DGLHeteroLineGraph(g._graph, backtracking))
dev = g.device
lg = DGLHeteroGraph(_CAPI_DGLHeteroLineGraph(g._graph.copy_to(nd.cpu()), backtracking))
lg = lg.to(dev)
if shared: if shared:
# copy edge features # copy edge features
lg.ndata.update(g.edata) lg.ndata.update(g.edata)
...@@ -453,7 +462,6 @@ def khop_adj(g, k): ...@@ -453,7 +462,6 @@ def khop_adj(g, k):
Examples Examples
-------- --------
>>> import dgl >>> import dgl
>>> g = dgl.DGLGraph() >>> g = dgl.DGLGraph()
>>> g.add_nodes(5) >>> g.add_nodes(5)
...@@ -863,6 +871,7 @@ def add_nodes(g, num, data=None, ntype=None): ...@@ -863,6 +871,7 @@ def add_nodes(g, num, data=None, ntype=None):
-------- --------
The following example uses PyTorch backend. The following example uses PyTorch backend.
>>> import dgl >>> import dgl
>>> import torch >>> import torch
...@@ -965,6 +974,7 @@ def add_edges(g, u, v, data=None, etype=None): ...@@ -965,6 +974,7 @@ def add_edges(g, u, v, data=None, etype=None):
-------- --------
The following example uses PyTorch backend. The following example uses PyTorch backend.
>>> import dgl >>> import dgl
>>> import torch >>> import torch
**Homogeneous Graphs or Heterogeneous Graphs with A Single Edge Type** **Homogeneous Graphs or Heterogeneous Graphs with A Single Edge Type**
...@@ -1538,10 +1548,12 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e ...@@ -1538,10 +1548,12 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e
Examples Examples
-------- --------
Converting a homogeneous graph to a block as described above: Converting a homogeneous graph to a block as described above:
>>> g = dgl.graph([(0, 1), (1, 2), (2, 3)]) >>> g = dgl.graph([(0, 1), (1, 2), (2, 3)])
>>> block = dgl.to_block(g, torch.LongTensor([3, 2])) >>> block = dgl.to_block(g, torch.LongTensor([3, 2]))
The right hand side nodes would be exactly the same as the ones given: [3, 2]. The right hand side nodes would be exactly the same as the ones given: [3, 2].
>>> induced_dst = block.dstdata[dgl.NID] >>> induced_dst = block.dstdata[dgl.NID]
>>> induced_dst >>> induced_dst
tensor([3, 2]) tensor([3, 2])
...@@ -1549,6 +1561,7 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e ...@@ -1549,6 +1561,7 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e
The first few nodes of the left hand side nodes would also be exactly the same as The first few nodes of the left hand side nodes would also be exactly the same as
the ones given. The rest of the nodes are the ones necessary for message passing the ones given. The rest of the nodes are the ones necessary for message passing
into nodes 3, 2. This means that the node 1 would be included. into nodes 3, 2. This means that the node 1 would be included.
>>> induced_src = block.srcdata[dgl.NID] >>> induced_src = block.srcdata[dgl.NID]
>>> induced_src >>> induced_src
tensor([3, 2, 1]) tensor([3, 2, 1])
...@@ -1557,22 +1570,26 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e ...@@ -1557,22 +1570,26 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e
the right hand side nodes. the right hand side nodes.
The induced edges can also be obtained by the following: The induced edges can also be obtained by the following:
>>> block.edata[dgl.EID] >>> block.edata[dgl.EID]
tensor([2, 1]) tensor([2, 1])
This indicates that edge (2, 3) and (1, 2) are included in the result graph. We can This indicates that edge (2, 3) and (1, 2) are included in the result graph. We can
verify that the first edge in the block indeed maps to the edge (2, 3), and the verify that the first edge in the block indeed maps to the edge (2, 3), and the
second edge in the block indeed maps to the edge (1, 2): second edge in the block indeed maps to the edge (1, 2):
>>> src, dst = block.edges(order='eid') >>> src, dst = block.edges(order='eid')
>>> induced_src[src], induced_dst[dst] >>> induced_src[src], induced_dst[dst]
(tensor([2, 1]), tensor([3, 2])) (tensor([2, 1]), tensor([3, 2]))
Converting a heterogeneous graph to a block is similar, except that when specifying Converting a heterogeneous graph to a block is similar, except that when specifying
the right hand side nodes, you have to give a dict: the right hand side nodes, you have to give a dict:
>>> g = dgl.bipartite([(0, 1), (1, 2), (2, 3)], utype='A', vtype='B') >>> g = dgl.bipartite([(0, 1), (1, 2), (2, 3)], utype='A', vtype='B')
If you don't specify any node of type A on the right hand side, the node type ``A`` If you don't specify any node of type A on the right hand side, the node type ``A``
in the block would have zero nodes on the DST side. in the block would have zero nodes on the DST side.
>>> block = dgl.to_block(g, {'B': torch.LongTensor([3, 2])}) >>> block = dgl.to_block(g, {'B': torch.LongTensor([3, 2])})
>>> block.number_of_dst_nodes('A') >>> block.number_of_dst_nodes('A')
0 0
...@@ -1582,10 +1599,12 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e ...@@ -1582,10 +1599,12 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e
tensor([3, 2]) tensor([3, 2])
The left hand side would contain all the nodes on the right hand side: The left hand side would contain all the nodes on the right hand side:
>>> block.srcnodes['B'].data[dgl.NID] >>> block.srcnodes['B'].data[dgl.NID]
tensor([3, 2]) tensor([3, 2])
As well as all the nodes that have connections to the nodes on the right hand side: As well as all the nodes that have connections to the nodes on the right hand side:
>>> block.srcnodes['A'].data[dgl.NID] >>> block.srcnodes['A'].data[dgl.NID]
tensor([2, 1]) tensor([2, 1])
""" """
......
...@@ -667,6 +667,11 @@ class FlattenedDict(object): ...@@ -667,6 +667,11 @@ class FlattenedDict(object):
group_sizes = {k: len(v) for k, v in groups.items()} group_sizes = {k: len(v) for k, v in groups.items()}
self._group_keys, self._group_sizes = zip(*group_sizes.items()) self._group_keys, self._group_sizes = zip(*group_sizes.items())
self._group_offsets = np.insert(np.cumsum(self._group_sizes), 0, 0) self._group_offsets = np.insert(np.cumsum(self._group_sizes), 0, 0)
# TODO: this is faster (37s -> 21s per epoch compared to searchsorted in GCMC) but takes
# O(E) memory.
self._idx_to_group = np.zeros(self._group_offsets[-1], dtype='int32')
for i in range(len(self._groups)):
self._idx_to_group[self._group_offsets[i]:self._group_offsets[i + 1]] = i
def __len__(self): def __len__(self):
"""Return the total number of items.""" """Return the total number of items."""
...@@ -680,10 +685,11 @@ class FlattenedDict(object): ...@@ -680,10 +685,11 @@ class FlattenedDict(object):
def __getitem__(self, idx): def __getitem__(self, idx):
"""Return the item at the given position with the key of its original group.""" """Return the item at the given position with the key of its original group."""
i = np.searchsorted(self._group_offsets, idx, 'right') - 1 i = self._idx_to_group[idx]
k = self._group_keys[i] k = self._group_keys[i]
j = idx - self._group_offsets[i] j = idx - self._group_offsets[i]
return k, self._groups[k][j] g = self._groups[k]
return k, g[j]
def compensate(ids, origin_ids): def compensate(ids, origin_ids):
"""computing the compensate set of ids from origin_ids """computing the compensate set of ids from origin_ids
......
...@@ -15,17 +15,16 @@ D = 5 ...@@ -15,17 +15,16 @@ D = 5
# line graph related # line graph related
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU not implemented")
def test_line_graph1(): def test_line_graph1():
N = 5 N = 5
G = dgl.DGLGraph(nx.star_graph(N)) G = dgl.DGLGraph(nx.star_graph(N)).to(F.ctx())
G.edata['h'] = F.randn((2 * N, D)) G.edata['h'] = F.randn((2 * N, D))
n_edges = G.number_of_edges() n_edges = G.number_of_edges()
L = G.line_graph(shared=True) L = G.line_graph(shared=True)
assert L.number_of_nodes() == 2 * N assert L.number_of_nodes() == 2 * N
assert F.allclose(L.ndata['h'], G.edata['h']) assert F.allclose(L.ndata['h'], G.edata['h'])
assert G.device == F.ctx()
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU not implemented")
@parametrize_dtype @parametrize_dtype
def test_line_graph2(idtype): def test_line_graph2(idtype):
g = dgl.graph(([0, 1, 1, 2, 2],[2, 0, 2, 0, 1]), g = dgl.graph(([0, 1, 1, 2, 2],[2, 0, 2, 0, 1]),
...@@ -73,7 +72,6 @@ def test_line_graph2(idtype): ...@@ -73,7 +72,6 @@ def test_line_graph2(idtype):
assert np.array_equal(col[order], assert np.array_equal(col[order],
np.array([3, 4, 0, 3, 4, 0, 1, 2])) np.array([3, 4, 0, 3, 4, 0, 1, 2]))
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU not implemented")
def test_no_backtracking(): def test_no_backtracking():
N = 5 N = 5
G = dgl.DGLGraph(nx.star_graph(N)) G = dgl.DGLGraph(nx.star_graph(N))
......
...@@ -4,11 +4,23 @@ import numpy as np ...@@ -4,11 +4,23 @@ import numpy as np
import unittest import unittest
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from collections import defaultdict from collections import defaultdict
from itertools import product
def _check_neighbor_sampling_dataloader(g, nids, dl): def _check_neighbor_sampling_dataloader(g, nids, dl, mode):
seeds = defaultdict(list) seeds = defaultdict(list)
for input_nodes, output_nodes, blocks in dl: for item in dl:
if mode == 'node':
input_nodes, output_nodes, blocks = item
elif mode == 'edge':
input_nodes, pair_graph, blocks = item
output_nodes = pair_graph.ndata[dgl.NID]
elif mode == 'link':
input_nodes, pair_graph, neg_graph, blocks = item
output_nodes = pair_graph.ndata[dgl.NID]
for ntype in pair_graph.ntypes:
assert F.array_equal(pair_graph.nodes[ntype].data[dgl.NID], neg_graph.nodes[ntype].data[dgl.NID])
if len(g.ntypes) > 1: if len(g.ntypes) > 1:
for ntype in g.ntypes: for ntype in g.ntypes:
assert F.array_equal(input_nodes[ntype], blocks[0].srcnodes[ntype].data[dgl.NID]) assert F.array_equal(input_nodes[ntype], blocks[0].srcnodes[ntype].data[dgl.NID])
...@@ -37,60 +49,123 @@ def _check_neighbor_sampling_dataloader(g, nids, dl): ...@@ -37,60 +49,123 @@ def _check_neighbor_sampling_dataloader(g, nids, dl):
dst = block.dstnodes[ntype].data[dgl.NID] dst = block.dstnodes[ntype].data[dgl.NID]
assert F.array_equal(src[:block.number_of_dst_nodes(ntype)], dst) assert F.array_equal(src[:block.number_of_dst_nodes(ntype)], dst)
prev_dst[ntype] = dst prev_dst[ntype] = dst
for ntype in blocks[-1].dsttypes:
seeds[ntype].append(blocks[-1].dstnodes[ntype].data[dgl.NID])
# Check if all nodes are iterated if mode == 'node':
for ntype in blocks[-1].dsttypes:
seeds[ntype].append(blocks[-1].dstnodes[ntype].data[dgl.NID])
elif mode == 'edge' or mode == 'link':
for etype in pair_graph.canonical_etypes:
seeds[etype].append(pair_graph.edges[etype].data[dgl.EID])
# Check if all nodes/edges are iterated
seeds = {k: F.cat(v, 0) for k, v in seeds.items()} seeds = {k: F.cat(v, 0) for k, v in seeds.items()}
for k, v in seeds.items(): for k, v in seeds.items():
if k in nids:
seed_set = set(F.asnumpy(nids[k]))
elif isinstance(k, tuple) and k[1] in nids:
seed_set = set(F.asnumpy(nids[k[1]]))
else:
continue
v_set = set(F.asnumpy(v)) v_set = set(F.asnumpy(v))
seed_set = set(nids[k])
assert v_set == seed_set assert v_set == seed_set
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented") @unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_neighbor_sampler_dataloader(): def test_neighbor_sampler_dataloader():
g = dgl.graph([(0,1),(0,2),(0,3),(1,0),(1,2),(1,3),(2,0)], g = dgl.graph([(0,1),(0,2),(0,3),(1,3),(1,4)],
'user', 'follow', num_nodes=6) 'user', 'follow', num_nodes=6).long()
g_sampler1 = dgl.sampling.MultiLayerNeighborSampler([2, 2], return_eids=True) g = dgl.to_bidirected(g)
g_sampler2 = dgl.sampling.MultiLayerNeighborSampler([None, None], return_eids=True) reverse_eids = F.tensor([5, 6, 7, 8, 9, 0, 1, 2, 3, 4], dtype=F.int64)
g_sampler1 = dgl.dataloading.MultiLayerNeighborSampler([2, 2], return_eids=True)
g_sampler2 = dgl.dataloading.MultiLayerFullNeighborSampler(2, return_eids=True)
hg = dgl.heterograph({ hg = dgl.heterograph({
('user', 'follow', 'user'): [(0, 1), (0, 2), (0, 3), (1, 0), (1, 2), (1, 3), (2, 0)], ('user', 'follow', 'user'): [(0, 1), (0, 2), (0, 3), (1, 0), (1, 2), (1, 3), (2, 0)],
('user', 'plays', 'game'): [(0, 0), (1, 1), (1, 2), (3, 0), (5, 2)], ('user', 'followed-by', 'user'): [(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1), (0, 2)],
('game', 'wanted-by', 'user'): [(0, 1), (2, 1), (1, 3), (2, 3), (2, 5)]}) ('user', 'play', 'game'): [(0, 0), (1, 1), (1, 2), (3, 0), (5, 2)],
hg_sampler1 = dgl.sampling.MultiLayerNeighborSampler( ('game', 'played-by', 'user'): [(0, 0), (1, 1), (2, 1), (0, 3), (2, 5)]}).long()
[{'plays': 1, 'wanted-by': 1, 'follow': 2}] * 2, hg_sampler1 = dgl.dataloading.MultiLayerNeighborSampler(
return_eids=True) [{'play': 1, 'played-by': 1, 'follow': 2, 'followed-by': 1}] * 2, return_eids=True)
hg_sampler2 = dgl.sampling.MultiLayerNeighborSampler([None, None], return_eids=True) hg_sampler2 = dgl.dataloading.MultiLayerFullNeighborSampler(2, return_eids=True)
reverse_etypes = {'follow': 'followed-by', 'followed-by': 'follow', 'play': 'played-by', 'played-by': 'play'}
collators = [
dgl.sampling.NodeCollator(g, [0, 1, 2, 3, 5], g_sampler1), collators = []
dgl.sampling.NodeCollator(g, [4, 5], g_sampler1), graphs = []
dgl.sampling.NodeCollator(g, [0, 1, 2, 3, 5], g_sampler2), nids = []
dgl.sampling.NodeCollator(g, [4, 5], g_sampler2), modes = []
dgl.sampling.NodeCollator(hg, {'user': [0, 1, 3, 5], 'game': [0, 1, 2]}, hg_sampler1), for seeds, sampler in product(
dgl.sampling.NodeCollator(hg, {'user': [4, 5], 'game': [0, 1, 2]}, hg_sampler1), [F.tensor([0, 1, 2, 3, 5], dtype=F.int64), F.tensor([4, 5], dtype=F.int64)],
dgl.sampling.NodeCollator(hg, {'user': [0, 1, 3, 5], 'game': [0, 1, 2]}, hg_sampler2), [g_sampler1, g_sampler2]):
dgl.sampling.NodeCollator(hg, {'user': [4, 5], 'game': [0, 1, 2]}, hg_sampler2)] collators.append(dgl.dataloading.NodeCollator(g, seeds, sampler))
nids = [ graphs.append(g)
{'user': [0, 1, 2, 3, 5]}, nids.append({'user': seeds})
{'user': [4, 5]}, modes.append('node')
{'user': [0, 1, 2, 3, 5]},
{'user': [4, 5]}, collators.append(dgl.dataloading.EdgeCollator(g, seeds, sampler))
{'user': [0, 1, 3, 5], 'game': [0, 1, 2]}, graphs.append(g)
{'user': [4, 5], 'game': [0, 1, 2]}, nids.append({'follow': seeds})
{'user': [0, 1, 3, 5], 'game': [0, 1, 2]}, modes.append('edge')
{'user': [4, 5], 'game': [0, 1, 2]}]
graphs = [g] * 4 + [hg] * 4 collators.append(dgl.dataloading.EdgeCollator(
samplers = [g_sampler1, g_sampler1, g_sampler2, g_sampler2, hg_sampler1, hg_sampler1, hg_sampler2, hg_sampler2] g, seeds, sampler, exclude='reverse_id', reverse_eids=reverse_eids))
graphs.append(g)
for _g, nid, collator in zip(graphs, nids, collators): nids.append({'follow': seeds})
modes.append('edge')
collators.append(dgl.dataloading.EdgeCollator(
g, seeds, sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
graphs.append(g)
nids.append({'follow': seeds})
modes.append('link')
collators.append(dgl.dataloading.EdgeCollator(
g, seeds, sampler, exclude='reverse_id', reverse_eids=reverse_eids,
negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
graphs.append(g)
nids.append({'follow': seeds})
modes.append('link')
for seeds, sampler in product(
[{'user': F.tensor([0, 1, 3, 5], dtype=F.int64), 'game': F.tensor([0, 1, 2], dtype=F.int64)},
{'user': F.tensor([4, 5], dtype=F.int64), 'game': F.tensor([0, 1, 2], dtype=F.int64)}],
[hg_sampler1, hg_sampler2]):
collators.append(dgl.dataloading.NodeCollator(hg, seeds, sampler))
graphs.append(hg)
nids.append(seeds)
modes.append('node')
for seeds, sampler in product(
[{'follow': F.tensor([0, 1, 3, 5], dtype=F.int64), 'play': F.tensor([1, 3], dtype=F.int64)},
{'follow': F.tensor([4, 5], dtype=F.int64), 'play': F.tensor([1, 3], dtype=F.int64)}],
[hg_sampler1, hg_sampler2]):
collators.append(dgl.dataloading.EdgeCollator(hg, seeds, sampler))
graphs.append(hg)
nids.append(seeds)
modes.append('edge')
collators.append(dgl.dataloading.EdgeCollator(
hg, seeds, sampler, exclude='reverse_types', reverse_etypes=reverse_etypes))
graphs.append(hg)
nids.append(seeds)
modes.append('edge')
collators.append(dgl.dataloading.EdgeCollator(
hg, seeds, sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
graphs.append(hg)
nids.append(seeds)
modes.append('link')
collators.append(dgl.dataloading.EdgeCollator(
hg, seeds, sampler, exclude='reverse_types', reverse_etypes=reverse_etypes,
negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
graphs.append(hg)
nids.append(seeds)
modes.append('link')
for _g, nid, collator, mode in zip(graphs, nids, collators, modes):
dl = DataLoader( dl = DataLoader(
collator.dataset, collate_fn=collator.collate, batch_size=2, shuffle=True, drop_last=False) collator.dataset, collate_fn=collator.collate, batch_size=2, shuffle=True, drop_last=False)
_check_neighbor_sampling_dataloader(_g, nid, dl) _check_neighbor_sampling_dataloader(_g, nid, dl, mode)
for _g, nid, sampler in zip(graphs, nids, samplers):
dl = dgl.sampling.NodeDataLoader(_g, nid, sampler, batch_size=2, shuffle=True, drop_last=False)
_check_neighbor_sampling_dataloader(_g, nid, dl)
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment