"docs/vscode:/vscode.git/clone" did not exist on "a0cd61e8577307529cd1e0cb92741c13ababde2b"
Unverified commit f5eb80d2 authored by Quan (Andy) Gan, committed by GitHub
Browse files

[Feature] Edge DataLoader for edge classification & link prediction (#1828)

* clean commit

* oops forgot the most important files

* use einsum

* copy feature from frontier to block

* Revert "copy feature from frontier to block"

This reverts commit 5224ec963eb6a3ef1b6ab74d8ecbd44e4e42f285.

* temp fix

* unit test

* fix

* revert jtnn

* lint

* fix win64

* docstring fixes and doc indexing

* revert einsum in sparse bidecoder

* fix some examples

* lint

* fix due to some tediousness in remove_edges

* addresses comments

* fix

* more jtnn fixes

* fix
parent d340ea3a
......@@ -334,6 +334,7 @@ def add_reverse_edges(g, readonly=None, copy_ndata=True,
num_nodes_dict[ntype] = g.number_of_nodes(ntype)
canonical_etypes = g.canonical_etypes
num_nodes_dict = {ntype: g.number_of_nodes(ntype) for ntype in g.ntypes}
# fast path
if ignore_bipartite is False:
subgs = {}
......@@ -403,10 +404,15 @@ def line_graph(g, backtracking=True, shared=False):
G : DGLHeteroGraph
The line graph of this graph.
Examples:
A = [[0, 0, 1],
[1, 0, 1],
[1, 1, 0]]
Notes
-----
The implementation is done on CPU, even if the input and output graphs are on GPU.
Examples
--------
>>> A = [[0, 0, 1],
... [1, 0, 1],
... [1, 1, 0]]
>>> g = dgl.graph(([0, 1, 1, 2, 2],[2, 0, 2, 0, 1]), 'user', 'follows')
>>> lg = g.line_graph()
>>> lg
......@@ -427,7 +433,10 @@ def line_graph(g, backtracking=True, shared=False):
"""
assert g.is_homogeneous(), \
'line_heterograph only support directed homogeneous graph right now'
lg = DGLHeteroGraph(_CAPI_DGLHeteroLineGraph(g._graph, backtracking))
dev = g.device
lg = DGLHeteroGraph(_CAPI_DGLHeteroLineGraph(g._graph.copy_to(nd.cpu()), backtracking))
lg = lg.to(dev)
if shared:
# copy edge features
lg.ndata.update(g.edata)
......@@ -453,7 +462,6 @@ def khop_adj(g, k):
Examples
--------
>>> import dgl
>>> g = dgl.DGLGraph()
>>> g.add_nodes(5)
......@@ -863,6 +871,7 @@ def add_nodes(g, num, data=None, ntype=None):
--------
The following example uses PyTorch backend.
>>> import dgl
>>> import torch
......@@ -965,6 +974,7 @@ def add_edges(g, u, v, data=None, etype=None):
--------
The following example uses PyTorch backend.
>>> import dgl
>>> import torch
**Homogeneous Graphs or Heterogeneous Graphs with A Single Edge Type**
......@@ -1538,10 +1548,12 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e
Examples
--------
Converting a homogeneous graph to a block as described above:
>>> g = dgl.graph([(0, 1), (1, 2), (2, 3)])
>>> block = dgl.to_block(g, torch.LongTensor([3, 2]))
The right hand side nodes would be exactly the same as the ones given: [3, 2].
>>> induced_dst = block.dstdata[dgl.NID]
>>> induced_dst
tensor([3, 2])
......@@ -1549,6 +1561,7 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e
The first few nodes of the left hand side nodes would also be exactly the same as
the ones given. The rest of the nodes are the ones necessary for message passing
into nodes 3, 2. This means that the node 1 would be included.
>>> induced_src = block.srcdata[dgl.NID]
>>> induced_src
tensor([3, 2, 1])
......@@ -1557,22 +1570,26 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e
the right hand side nodes.
The induced edges can also be obtained by the following:
>>> block.edata[dgl.EID]
tensor([2, 1])
This indicates that edge (2, 3) and (1, 2) are included in the result graph. We can
verify that the first edge in the block indeed maps to the edge (2, 3), and the
second edge in the block indeed maps to the edge (1, 2):
>>> src, dst = block.edges(order='eid')
>>> induced_src[src], induced_dst[dst]
(tensor([2, 1]), tensor([3, 2]))
Converting a heterogeneous graph to a block is similar, except that when specifying
the right hand side nodes, you have to give a dict:
>>> g = dgl.bipartite([(0, 1), (1, 2), (2, 3)], utype='A', vtype='B')
If you don't specify any node of type A on the right hand side, the node type ``A``
in the block would have zero nodes on the DST side.
>>> block = dgl.to_block(g, {'B': torch.LongTensor([3, 2])})
>>> block.number_of_dst_nodes('A')
0
......@@ -1582,10 +1599,12 @@ def to_block(g, dst_nodes=None, include_dst_in_src=True, copy_ndata=True, copy_e
tensor([3, 2])
The left hand side would contain all the nodes on the right hand side:
>>> block.srcnodes['B'].data[dgl.NID]
tensor([3, 2])
As well as all the nodes that have connections to the nodes on the right hand side:
>>> block.srcnodes['A'].data[dgl.NID]
tensor([2, 1])
"""
......
......@@ -667,6 +667,11 @@ class FlattenedDict(object):
group_sizes = {k: len(v) for k, v in groups.items()}
self._group_keys, self._group_sizes = zip(*group_sizes.items())
self._group_offsets = np.insert(np.cumsum(self._group_sizes), 0, 0)
# TODO: this is faster (37s -> 21s per epoch compared to searchsorted in GCMC) but takes
# O(E) memory.
self._idx_to_group = np.zeros(self._group_offsets[-1], dtype='int32')
for i in range(len(self._groups)):
self._idx_to_group[self._group_offsets[i]:self._group_offsets[i + 1]] = i
def __len__(self):
"""Return the total number of items."""
......@@ -680,10 +685,11 @@ class FlattenedDict(object):
def __getitem__(self, idx):
"""Return the item at the given position with the key of its original group."""
i = np.searchsorted(self._group_offsets, idx, 'right') - 1
i = self._idx_to_group[idx]
k = self._group_keys[i]
j = idx - self._group_offsets[i]
return k, self._groups[k][j]
g = self._groups[k]
return k, g[j]
def compensate(ids, origin_ids):
"""computing the compensate set of ids from origin_ids
......
......@@ -15,17 +15,16 @@ D = 5
# line graph related
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU not implemented")
def test_line_graph1():
    """Line graph of a star graph: shared=True must expose the original edge
    features as node features, and the graph must stay on the backend device."""
    N = 5
    # NOTE: the duplicate pre-change construction of G (without .to(F.ctx()))
    # was diff residue and has been removed; only the device-aware line is live.
    G = dgl.DGLGraph(nx.star_graph(N)).to(F.ctx())
    # A star graph with N leaves has 2*N directed edges after dgl conversion.
    G.edata['h'] = F.randn((2 * N, D))
    L = G.line_graph(shared=True)
    assert L.number_of_nodes() == 2 * N
    # shared=True: line-graph node features alias the source edge features.
    assert F.allclose(L.ndata['h'], G.edata['h'])
    # line_graph must not silently move the input off its device.
    assert G.device == F.ctx()
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU not implemented")
@parametrize_dtype
def test_line_graph2(idtype):
g = dgl.graph(([0, 1, 1, 2, 2],[2, 0, 2, 0, 1]),
......@@ -73,7 +72,6 @@ def test_line_graph2(idtype):
assert np.array_equal(col[order],
np.array([3, 4, 0, 3, 4, 0, 1, 2]))
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU not implemented")
def test_no_backtracking():
N = 5
G = dgl.DGLGraph(nx.star_graph(N))
......
......@@ -4,11 +4,23 @@ import numpy as np
import unittest
from torch.utils.data import DataLoader
from collections import defaultdict
from itertools import product
def _check_neighbor_sampling_dataloader(g, nids, dl):
def _check_neighbor_sampling_dataloader(g, nids, dl, mode):
seeds = defaultdict(list)
for input_nodes, output_nodes, blocks in dl:
for item in dl:
if mode == 'node':
input_nodes, output_nodes, blocks = item
elif mode == 'edge':
input_nodes, pair_graph, blocks = item
output_nodes = pair_graph.ndata[dgl.NID]
elif mode == 'link':
input_nodes, pair_graph, neg_graph, blocks = item
output_nodes = pair_graph.ndata[dgl.NID]
for ntype in pair_graph.ntypes:
assert F.array_equal(pair_graph.nodes[ntype].data[dgl.NID], neg_graph.nodes[ntype].data[dgl.NID])
if len(g.ntypes) > 1:
for ntype in g.ntypes:
assert F.array_equal(input_nodes[ntype], blocks[0].srcnodes[ntype].data[dgl.NID])
......@@ -37,60 +49,123 @@ def _check_neighbor_sampling_dataloader(g, nids, dl):
dst = block.dstnodes[ntype].data[dgl.NID]
assert F.array_equal(src[:block.number_of_dst_nodes(ntype)], dst)
prev_dst[ntype] = dst
for ntype in blocks[-1].dsttypes:
seeds[ntype].append(blocks[-1].dstnodes[ntype].data[dgl.NID])
# Check if all nodes are iterated
if mode == 'node':
for ntype in blocks[-1].dsttypes:
seeds[ntype].append(blocks[-1].dstnodes[ntype].data[dgl.NID])
elif mode == 'edge' or mode == 'link':
for etype in pair_graph.canonical_etypes:
seeds[etype].append(pair_graph.edges[etype].data[dgl.EID])
# Check if all nodes/edges are iterated
seeds = {k: F.cat(v, 0) for k, v in seeds.items()}
for k, v in seeds.items():
if k in nids:
seed_set = set(F.asnumpy(nids[k]))
elif isinstance(k, tuple) and k[1] in nids:
seed_set = set(F.asnumpy(nids[k[1]]))
else:
continue
v_set = set(F.asnumpy(v))
seed_set = set(nids[k])
assert v_set == seed_set
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU sample neighbors not implemented")
def test_neighbor_sampler_dataloader():
g = dgl.graph([(0,1),(0,2),(0,3),(1,0),(1,2),(1,3),(2,0)],
'user', 'follow', num_nodes=6)
g_sampler1 = dgl.sampling.MultiLayerNeighborSampler([2, 2], return_eids=True)
g_sampler2 = dgl.sampling.MultiLayerNeighborSampler([None, None], return_eids=True)
g = dgl.graph([(0,1),(0,2),(0,3),(1,3),(1,4)],
'user', 'follow', num_nodes=6).long()
g = dgl.to_bidirected(g)
reverse_eids = F.tensor([5, 6, 7, 8, 9, 0, 1, 2, 3, 4], dtype=F.int64)
g_sampler1 = dgl.dataloading.MultiLayerNeighborSampler([2, 2], return_eids=True)
g_sampler2 = dgl.dataloading.MultiLayerFullNeighborSampler(2, return_eids=True)
hg = dgl.heterograph({
('user', 'follow', 'user'): [(0, 1), (0, 2), (0, 3), (1, 0), (1, 2), (1, 3), (2, 0)],
('user', 'plays', 'game'): [(0, 0), (1, 1), (1, 2), (3, 0), (5, 2)],
('game', 'wanted-by', 'user'): [(0, 1), (2, 1), (1, 3), (2, 3), (2, 5)]})
hg_sampler1 = dgl.sampling.MultiLayerNeighborSampler(
[{'plays': 1, 'wanted-by': 1, 'follow': 2}] * 2,
return_eids=True)
hg_sampler2 = dgl.sampling.MultiLayerNeighborSampler([None, None], return_eids=True)
collators = [
dgl.sampling.NodeCollator(g, [0, 1, 2, 3, 5], g_sampler1),
dgl.sampling.NodeCollator(g, [4, 5], g_sampler1),
dgl.sampling.NodeCollator(g, [0, 1, 2, 3, 5], g_sampler2),
dgl.sampling.NodeCollator(g, [4, 5], g_sampler2),
dgl.sampling.NodeCollator(hg, {'user': [0, 1, 3, 5], 'game': [0, 1, 2]}, hg_sampler1),
dgl.sampling.NodeCollator(hg, {'user': [4, 5], 'game': [0, 1, 2]}, hg_sampler1),
dgl.sampling.NodeCollator(hg, {'user': [0, 1, 3, 5], 'game': [0, 1, 2]}, hg_sampler2),
dgl.sampling.NodeCollator(hg, {'user': [4, 5], 'game': [0, 1, 2]}, hg_sampler2)]
nids = [
{'user': [0, 1, 2, 3, 5]},
{'user': [4, 5]},
{'user': [0, 1, 2, 3, 5]},
{'user': [4, 5]},
{'user': [0, 1, 3, 5], 'game': [0, 1, 2]},
{'user': [4, 5], 'game': [0, 1, 2]},
{'user': [0, 1, 3, 5], 'game': [0, 1, 2]},
{'user': [4, 5], 'game': [0, 1, 2]}]
graphs = [g] * 4 + [hg] * 4
samplers = [g_sampler1, g_sampler1, g_sampler2, g_sampler2, hg_sampler1, hg_sampler1, hg_sampler2, hg_sampler2]
for _g, nid, collator in zip(graphs, nids, collators):
('user', 'followed-by', 'user'): [(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1), (0, 2)],
('user', 'play', 'game'): [(0, 0), (1, 1), (1, 2), (3, 0), (5, 2)],
('game', 'played-by', 'user'): [(0, 0), (1, 1), (2, 1), (0, 3), (2, 5)]}).long()
hg_sampler1 = dgl.dataloading.MultiLayerNeighborSampler(
[{'play': 1, 'played-by': 1, 'follow': 2, 'followed-by': 1}] * 2, return_eids=True)
hg_sampler2 = dgl.dataloading.MultiLayerFullNeighborSampler(2, return_eids=True)
reverse_etypes = {'follow': 'followed-by', 'followed-by': 'follow', 'play': 'played-by', 'played-by': 'play'}
collators = []
graphs = []
nids = []
modes = []
for seeds, sampler in product(
[F.tensor([0, 1, 2, 3, 5], dtype=F.int64), F.tensor([4, 5], dtype=F.int64)],
[g_sampler1, g_sampler2]):
collators.append(dgl.dataloading.NodeCollator(g, seeds, sampler))
graphs.append(g)
nids.append({'user': seeds})
modes.append('node')
collators.append(dgl.dataloading.EdgeCollator(g, seeds, sampler))
graphs.append(g)
nids.append({'follow': seeds})
modes.append('edge')
collators.append(dgl.dataloading.EdgeCollator(
g, seeds, sampler, exclude='reverse_id', reverse_eids=reverse_eids))
graphs.append(g)
nids.append({'follow': seeds})
modes.append('edge')
collators.append(dgl.dataloading.EdgeCollator(
g, seeds, sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
graphs.append(g)
nids.append({'follow': seeds})
modes.append('link')
collators.append(dgl.dataloading.EdgeCollator(
g, seeds, sampler, exclude='reverse_id', reverse_eids=reverse_eids,
negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
graphs.append(g)
nids.append({'follow': seeds})
modes.append('link')
for seeds, sampler in product(
[{'user': F.tensor([0, 1, 3, 5], dtype=F.int64), 'game': F.tensor([0, 1, 2], dtype=F.int64)},
{'user': F.tensor([4, 5], dtype=F.int64), 'game': F.tensor([0, 1, 2], dtype=F.int64)}],
[hg_sampler1, hg_sampler2]):
collators.append(dgl.dataloading.NodeCollator(hg, seeds, sampler))
graphs.append(hg)
nids.append(seeds)
modes.append('node')
for seeds, sampler in product(
[{'follow': F.tensor([0, 1, 3, 5], dtype=F.int64), 'play': F.tensor([1, 3], dtype=F.int64)},
{'follow': F.tensor([4, 5], dtype=F.int64), 'play': F.tensor([1, 3], dtype=F.int64)}],
[hg_sampler1, hg_sampler2]):
collators.append(dgl.dataloading.EdgeCollator(hg, seeds, sampler))
graphs.append(hg)
nids.append(seeds)
modes.append('edge')
collators.append(dgl.dataloading.EdgeCollator(
hg, seeds, sampler, exclude='reverse_types', reverse_etypes=reverse_etypes))
graphs.append(hg)
nids.append(seeds)
modes.append('edge')
collators.append(dgl.dataloading.EdgeCollator(
hg, seeds, sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
graphs.append(hg)
nids.append(seeds)
modes.append('link')
collators.append(dgl.dataloading.EdgeCollator(
hg, seeds, sampler, exclude='reverse_types', reverse_etypes=reverse_etypes,
negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
graphs.append(hg)
nids.append(seeds)
modes.append('link')
for _g, nid, collator, mode in zip(graphs, nids, collators, modes):
dl = DataLoader(
collator.dataset, collate_fn=collator.collate, batch_size=2, shuffle=True, drop_last=False)
_check_neighbor_sampling_dataloader(_g, nid, dl)
for _g, nid, sampler in zip(graphs, nids, samplers):
dl = dgl.sampling.NodeDataLoader(_g, nid, sampler, batch_size=2, shuffle=True, drop_last=False)
_check_neighbor_sampling_dataloader(_g, nid, dl)
_check_neighbor_sampling_dataloader(_g, nid, dl, mode)
if __name__ == '__main__':
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment