Commit 632a9af8 authored by xiang song(charlie.song)'s avatar xiang song(charlie.song) Committed by Da Zheng
Browse files

[Feature] Non-Uniform Edge Sampler (#1087)

* Add weight based edge sampler

* Can run, edge weight work.
TODO: test node weight

* Fix node weight sample

* Fix y

* Update doc

* Fix syntax

* Fix

* Fix GPU test for sampler

* Fix test

* Fix

* Refactor EdgeSampler to act as class object not function that it
can record its own private states.

* clean

* Fix

* Fix

* Fix run bug on kg app

* update

* update test

* test

* Simply python API and fix some C code

* Fix

* Fix

* Fix syntax

* Fix

* Update API description
parent dd65ee21
......@@ -7,6 +7,7 @@ from numbers import Integral
import traceback
from ..._ffi.function import _init_api
from ..._ffi.object import register_object, ObjectBase
from ..._ffi.ndarray import empty
from ... import utils
from ...nodeflow import NodeFlow
......@@ -509,6 +510,16 @@ class EdgeSampler(object):
The sampler returns EdgeSubgraph, where a user can access the unique head nodes
and tail nodes directly.
This sampler supports non-uniform sampling of both positive and negative edges.
To sample positive edges non-uniformly, users need to provide an array of m
elements (m is the number of edges), i.e. edge_weight, each of which represents
the sampling probability of the corresponding edge. To sample negative edges
non-uniformly, users need to provide an array of n elements, i.e. node_weight,
and the sampler draws the nodes used to corrupt a positive edge according to
these probabilities. If neither edge_weight nor node_weight is provided, a
uniform sampler is used. If only edge_weight is provided, the sampler samples
nodes uniformly when corrupting positive edges.
When the flag `return_false_neg` is turned on, the sampler will also check
if the generated negative edges are true negative edges and will return
a vector that indicates false negative edges. The vector is stored in
......@@ -519,6 +530,11 @@ class EdgeSampler(object):
edge only if the triple (source node, destination node and relation)
matches one of the edges in the graph.
For uniform sampling, the sampler generates only num_of_edges/batch_size
samples.
For non-uniform (weighted) sampling, the sampler generates samples infinitely.
Parameters
----------
g : DGLGraph
......@@ -527,6 +543,11 @@ class EdgeSampler(object):
The batch size (i.e, the number of edges from the graph)
seed_edges : tensor, optional
A list of edges where we sample from.
edge_weight : tensor, optional
The weight of each edge, which determines the chance of that edge being sampled.
node_weight : tensor, optional
The weight of each node, which determines the chance of that node being sampled.
Used in negative sampling. If not provided, uniform node sampling is used.
shuffle : bool, optional
whether randomly shuffle the list of edges where we sample from.
num_workers : int, optional
......@@ -564,6 +585,8 @@ class EdgeSampler(object):
g,
batch_size,
seed_edges=None,
edge_weight=None,
node_weight=None,
shuffle=False,
num_workers=1,
prefetch=False,
......@@ -596,6 +619,16 @@ class EdgeSampler(object):
self._seed_edges = seed_edges
if shuffle:
self._seed_edges = F.rand_shuffle(self._seed_edges)
if edge_weight is None:
self._is_uniform = True
else:
self._is_uniform = False
self._edge_weight = F.zerocopy_to_dgl_ndarray(edge_weight[self._seed_edges])
if node_weight is None:
self._node_weight = empty((0,), 'float32')
else:
self._node_weight = F.zerocopy_to_dgl_ndarray(node_weight)
self._seed_edges = utils.toindex(self._seed_edges)
if prefetch:
......@@ -606,6 +639,30 @@ class EdgeSampler(object):
self._negative_mode = negative_mode
self._neg_sample_size = neg_sample_size
self._exclude_positive = exclude_positive
if self._is_uniform:
self._sampler = _CAPI_CreateUniformEdgeSampler(
self.g._graph,
self.seed_edges.todgltensor(),
self.batch_size, # batch size
self._num_workers, # num batches
self._negative_mode,
self._neg_sample_size,
self._exclude_positive,
self._return_false_neg,
self._relations)
else:
self._sampler = _CAPI_CreateWeightedEdgeSampler(
self.g._graph,
self._seed_edges.todgltensor(),
self._edge_weight,
self._node_weight,
self._batch_size, # batch size
self._num_workers, # num batches
self._negative_mode,
self._neg_sample_size,
self._exclude_positive,
self._return_false_neg,
self._relations)
def fetch(self, current_index):
'''
......@@ -616,24 +673,19 @@ class EdgeSampler(object):
Parameters
----------
current_index : int
How many batches the sampler has generated so far.
Deprecated; this argument is ignored.
Returns
-------
list[GraphIndex] or list[(GraphIndex, GraphIndex)]
Next "bunch" of edges to be processed.
'''
subgs = _CAPI_UniformEdgeSampling(
self.g._graph,
self.seed_edges.todgltensor(),
current_index, # start batch id
self.batch_size, # batch size
self._num_workers, # num batches
self._negative_mode,
self._neg_sample_size,
self._exclude_positive,
self._return_false_neg,
self._relations)
if self._is_uniform:
subgs = _CAPI_FetchUniformEdgeSample(
self._sampler)
else:
subgs = _CAPI_FetchWeightedEdgeSample(
self._sampler)
if len(subgs) == 0:
return []
......@@ -673,7 +725,6 @@ class EdgeSampler(object):
def batch_size(self):
return self._batch_size
def create_full_nodeflow(g, num_layers, add_self_loop=False):
"""Convert a full graph to NodeFlow to run a L-layer GNN model.
......
This diff is collapsed.
......@@ -237,6 +237,7 @@ def check_head_tail(g):
def check_negative_sampler(mode, exclude_positive, neg_size):
g = generate_rand_graph(100)
num_edges = g.number_of_edges()
etype = np.random.randint(0, 10, size=g.number_of_edges(), dtype=np.int64)
g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu())
......@@ -249,7 +250,10 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
# Test the homogeneous graph.
for pos_edges, neg_edges in EdgeSampler(g, 50,
total_samples = 0
batch_size = 50
max_samples = num_edges
for pos_edges, neg_edges in EdgeSampler(g, batch_size,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=exclude_positive,
......@@ -284,8 +288,13 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
else:
assert F.array_equal(g.has_edges_between(neg_src, neg_dst), exist)
total_samples += batch_size
if (total_samples >= max_samples):
break
# Test the knowledge graph.
for _, neg_edges in EdgeSampler(g, 50,
total_samples = 0
for _, neg_edges in EdgeSampler(g, batch_size,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=exclude_positive,
......@@ -304,12 +313,223 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
etype = g.edata['etype'][eid]
exist = neg_edges.edata['etype'][i] == etype
assert F.asnumpy(exists[i]) == F.asnumpy(exist)
total_samples += batch_size
if (total_samples >= max_samples):
break
def check_weighted_negative_sampler(mode, exclude_positive, neg_size):
    """Exercise the weighted EdgeSampler.

    Runs three correctness passes (homogeneous graph, knowledge graph with
    edge weight, knowledge graph with edge and node weight) and then checks
    that skewed weights actually skew the empirical sampling rates.

    Parameters
    ----------
    mode : str
        Negative-sampling mode passed to EdgeSampler (e.g. 'head', 'PBG-head').
    exclude_positive : bool
        Whether true edges are excluded from the negative samples.
    neg_size : int
        Number of negative samples per positive edge.
    """
    g = generate_rand_graph(100)
    num_edges = g.number_of_edges()
    num_nodes = g.number_of_nodes()
    # All-ones weights: the weighted sampler must pass the same correctness
    # checks as the uniform one.
    edge_weight = F.copy_to(F.tensor(np.full((num_edges,), 1, dtype=np.float32)), F.cpu())
    node_weight = F.copy_to(F.tensor(np.full((num_nodes,), 1, dtype=np.float32)), F.cpu())
    etype = np.random.randint(0, 10, size=num_edges, dtype=np.int64)
    g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu())

    # Map (destination node, edge id) -> source node for every positive edge,
    # so each negative edge can be traced back to the edge it corrupts.
    pos_gsrc, pos_gdst, pos_geid = g.all_edges(form='all', order='eid')
    pos_map = {}
    for k in range(len(pos_geid)):
        dst_id = int(F.asnumpy(pos_gdst[k]))
        edge_id = int(F.asnumpy(pos_geid[k]))
        pos_map[(dst_id, edge_id)] = int(F.asnumpy(pos_gsrc[k]))

    EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')

    # ---- Correctness check: homogeneous graph ----
    batch_size = 50
    seen = 0
    max_samples = num_edges
    for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                            edge_weight=edge_weight,
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
                                            exclude_positive=exclude_positive,
                                            return_false_neg=True):
        pos_lsrc, pos_ldst, pos_leid = pos_edges.all_edges(form='all', order='eid')
        # Every sampled positive edge must exist in the parent graph.
        assert_array_equal(F.asnumpy(pos_edges.parent_eid[pos_leid]),
                           F.asnumpy(g.edge_ids(pos_edges.parent_nid[pos_lsrc],
                                                pos_edges.parent_nid[pos_ldst])))

        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
        neg_src = neg_edges.parent_nid[neg_lsrc]
        neg_dst = neg_edges.parent_nid[neg_ldst]
        neg_eid = neg_edges.parent_eid[neg_leid]
        for k in range(len(neg_eid)):
            neg_d = int(F.asnumpy(neg_dst[k]))
            neg_e = int(F.asnumpy(neg_eid[k]))
            # Each negative edge corresponds to some positive edge ...
            assert (neg_d, neg_e) in pos_map
            if exclude_positive:
                # ... but must not reproduce its true source node.
                assert int(F.asnumpy(neg_src[k])) != pos_map[(neg_d, neg_e)]

        check_head_tail(neg_edges)
        pos_tails = np.sort(F.asnumpy(pos_edges.parent_nid[pos_edges.tail_nid]))
        neg_tails = np.sort(F.asnumpy(neg_edges.parent_nid[neg_edges.tail_nid]))
        np.testing.assert_equal(pos_tails, neg_tails)

        exist = neg_edges.edata['false_neg']
        if exclude_positive:
            # No false negative can remain when positives are excluded.
            assert np.sum(F.asnumpy(exist) == 0) == len(exist)
        else:
            assert F.array_equal(g.has_edges_between(neg_src, neg_dst), exist)
        seen += batch_size
        if seen >= max_samples:
            break

    # ---- Correctness check: knowledge graph with edge weight provided ----
    seen = 0
    for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                            edge_weight=edge_weight,
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
                                            exclude_positive=exclude_positive,
                                            relations=g.edata['etype'],
                                            return_false_neg=True):
        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
        neg_src = neg_edges.parent_nid[neg_lsrc]
        neg_dst = neg_edges.parent_nid[neg_ldst]
        neg_eid = neg_edges.parent_eid[neg_leid]
        exists = neg_edges.edata['false_neg']
        neg_edges.edata['etype'] = g.edata['etype'][neg_eid]
        for k in range(len(neg_eid)):
            u, v = F.asnumpy(neg_src[k]), F.asnumpy(neg_dst[k])
            if g.has_edge_between(u, v):
                # With relations, an edge is a false negative only when the
                # relation type matches as well.
                eid = g.edge_id(u, v)
                etype = g.edata['etype'][eid]
                exist = neg_edges.edata['etype'][k] == etype
                assert F.asnumpy(exists[k]) == F.asnumpy(exist)
        seen += batch_size
        if seen >= max_samples:
            break

    # ---- Correctness check: knowledge graph with edge/node weight provided ----
    seen = 0
    for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                            edge_weight=edge_weight,
                                            node_weight=node_weight,
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
                                            exclude_positive=exclude_positive,
                                            relations=g.edata['etype'],
                                            return_false_neg=True):
        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
        neg_src = neg_edges.parent_nid[neg_lsrc]
        neg_dst = neg_edges.parent_nid[neg_ldst]
        neg_eid = neg_edges.parent_eid[neg_leid]
        exists = neg_edges.edata['false_neg']
        neg_edges.edata['etype'] = g.edata['etype'][neg_eid]
        for k in range(len(neg_eid)):
            u, v = F.asnumpy(neg_src[k]), F.asnumpy(neg_dst[k])
            if g.has_edge_between(u, v):
                eid = g.edge_id(u, v)
                etype = g.edata['etype'][eid]
                exist = neg_edges.edata['etype'][k] == etype
                assert F.asnumpy(exists[k]) == F.asnumpy(exist)
        seen += batch_size
        if seen >= max_samples:
            break

    # ---- Sampling-rate check ----
    dgl.random.seed(0)
    g = generate_rand_graph(1000)
    num_edges = g.number_of_edges()
    num_nodes = g.number_of_nodes()
    # Edge 0 carries as much weight as all other edges combined (~0.5 of the
    # mass); the last node carries num_nodes/200 of the total node weight.
    edge_weight = F.copy_to(F.tensor(np.full((num_edges,), 1, dtype=np.float32)), F.cpu())
    edge_weight[0] = F.sum(edge_weight, dim=0)
    node_weight = F.copy_to(F.tensor(np.full((num_nodes,), 1, dtype=np.float32)), F.cpu())
    node_weight[-1] = F.sum(node_weight, dim=0) / 200
    etype = np.random.randint(0, 20, size=num_edges, dtype=np.int64)
    g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu())

    # Test w/o node weight: knowledge graph with edge weight provided only.
    max_samples = num_edges / 5
    seen = 0
    edge_sampled = np.full((num_edges,), 0, dtype=np.int32)
    node_sampled = np.full((num_nodes,), 0, dtype=np.int32)
    for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                            edge_weight=edge_weight,
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
                                            exclude_positive=False,
                                            relations=g.edata['etype'],
                                            return_false_neg=True):
        _, _, pos_leid = pos_edges.all_edges(form='all', order='eid')
        neg_lsrc, neg_ldst, _ = neg_edges.all_edges(form='all', order='eid')
        # Only the corrupted end of each triple counts as a sampled node.
        if 'head' in mode:
            np.add.at(node_sampled, F.asnumpy(neg_edges.parent_nid[neg_lsrc]), 1)
        else:
            np.add.at(node_sampled, F.asnumpy(neg_edges.parent_nid[neg_ldst]), 1)
        np.add.at(edge_sampled, F.asnumpy(pos_edges.parent_eid[pos_leid]), 1)
        seen += batch_size
        if seen >= max_samples:
            break

    # Edge 0 should receive ~half of all samples; the tail half of the edge
    # ids should receive ~a quarter in total. Nodes are drawn uniformly here.
    edge_total = edge_sampled.sum()
    assert np.allclose(edge_sampled[0] / edge_total, 0.5, atol=0.05)
    assert np.allclose(edge_sampled[edge_sampled.shape[0] // 2:-1].sum() / edge_total,
                       0.25, atol=0.05)
    node_total = node_sampled.sum()
    assert node_sampled[0] / node_total < 0.02
    assert np.allclose(node_sampled[node_sampled.shape[0] // 2:-1].sum() / node_total,
                       0.5, atol=0.02)

    # Test the knowledge graph with edge/node weight provided.
    seen = 0
    edge_sampled = np.full((num_edges,), 0, dtype=np.int32)
    node_sampled = np.full((num_nodes,), 0, dtype=np.int32)
    for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                            edge_weight=edge_weight,
                                            node_weight=node_weight,
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
                                            exclude_positive=False,
                                            relations=g.edata['etype'],
                                            return_false_neg=True):
        _, _, pos_leid = pos_edges.all_edges(form='all', order='eid')
        neg_lsrc, neg_ldst, _ = neg_edges.all_edges(form='all', order='eid')
        if 'head' in mode:
            np.add.at(node_sampled, F.asnumpy(neg_edges.parent_nid[neg_lsrc]), 1)
        else:
            np.add.at(node_sampled, F.asnumpy(neg_edges.parent_nid[neg_ldst]), 1)
        np.add.at(edge_sampled, F.asnumpy(pos_edges.parent_eid[pos_leid]), 1)
        seen += batch_size
        if seen >= max_samples:
            break

    edge_total = edge_sampled.sum()
    assert np.allclose(edge_sampled[0] / edge_total, 0.5, atol=0.05)
    assert np.allclose(edge_sampled[edge_sampled.shape[0] // 2:-1].sum() / edge_total,
                       0.25, atol=0.05)
    node_total = node_sampled.sum()
    node_rate = node_sampled[-1] / node_total
    node_rate_a = np.average(node_sampled[:50]) / node_total
    node_rate_b = np.average(node_sampled[50:100]) / node_total
    # As neg sampling does not contain duplicate nodes,
    # this test takes some acceptable variation on the sample rate.
    assert np.allclose(node_rate, node_rate_a * 5, atol=0.002)
    assert np.allclose(node_rate_a, node_rate_b, atol=0.0002)
@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="Core dump")
def test_negative_sampler():
    """Run the uniform and weighted negative-sampler checks."""
    # Uniform sampler first, then the weighted variant, over the same modes.
    for checker in (check_negative_sampler, check_weighted_negative_sampler):
        checker('PBG-head', False, 10)
        checker('head', True, 10)
        checker('head', False, 10)
    # Disabled for now: it might take too long.
    # check_negative_sampler('head', False, 100)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment