"src/array/cuda/negative_sampling.hip" did not exist on "1c9d2a03023c64380d69b24f6e6bd0393417f69d"
Commit 632a9af8 authored by xiang song(charlie.song), committed by Da Zheng

[Feature] Non-Uniform Edge Sampler (#1087)

* Add weight based edge sampler

* Can run; edge weight works.
TODO: test node weight

* Fix node weight sample

* Fix y

* Update doc

* Fix syntax

* Fix

* Fix GPU test for sampler

* Fix test

* Fix

* Refactor EdgeSampler to act as a class object rather than a function, so that it
can record its own private state.

* clean

* Fix

* Fix

* Fix run bug on kg app

* update

* update test

* test

* Simplify Python API and fix some C code

* Fix

* Fix

* Fix syntax

* Fix

* Update API description
parent dd65ee21
@@ -7,6 +7,7 @@ from numbers import Integral
import traceback
from ..._ffi.function import _init_api
from ..._ffi.object import register_object, ObjectBase
from ..._ffi.ndarray import empty
from ... import utils
from ...nodeflow import NodeFlow
@@ -509,6 +510,16 @@ class EdgeSampler(object):
The sampler returns EdgeSubgraph, where a user can access the unique head nodes
and tail nodes directly.
This sampler supports non-uniform sampling of both positive and negative edges.
To sample positive edges non-uniformly, provide edge_weight, an array of m
elements (m is the number of edges), each giving the sampling probability of
the corresponding edge. To sample negative edges non-uniformly, provide
node_weight, an array of n elements (n is the number of nodes); the sampler
draws the nodes used to corrupt a positive edge according to these probabilities.
If neither edge_weight nor node_weight is provided, a uniform sampler is used.
If only edge_weight is provided, the sampler falls back to uniform node sampling
when corrupting positive edges.
When the flag `return_false_neg` is turned on, the sampler will also check
if the generated negative edges are true negative edges and will return
a vector that indicates false negative edges. The vector is stored in
@@ -519,6 +530,11 @@ class EdgeSampler(object):
edge only if the triple (source node, destination node and relation)
matches one of the edges in the graph.
For uniform sampling, the sampler generates only num_of_edges/batch_size
samples.
For weighted (non-uniform) sampling, the sampler generates samples indefinitely.
Parameters
----------
g : DGLGraph
@@ -527,6 +543,11 @@ class EdgeSampler(object):
The batch size (i.e, the number of edges from the graph)
seed_edges : tensor, optional
A list of edges where we sample from.
edge_weight : tensor, optional
The weight of each edge, which decides the chance of the edge being sampled.
If not provided, uniform edge sampling is used.
node_weight : tensor, optional
The weight of each node, which decides the chance of the node being sampled.
Used in negative sampling. If not provided, uniform node sampling is used.
shuffle : bool, optional
Whether to randomly shuffle the list of edges to sample from.
num_workers : int, optional
@@ -564,6 +585,8 @@ class EdgeSampler(object):
g,
batch_size,
seed_edges=None,
edge_weight=None,
node_weight=None,
shuffle=False,
num_workers=1,
prefetch=False,
@@ -596,6 +619,16 @@ class EdgeSampler(object):
self._seed_edges = seed_edges
if shuffle:
    self._seed_edges = F.rand_shuffle(self._seed_edges)
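# Note (editorial, based on the docstring above): weighted sampling is enabled
# only when edge_weight is given; an empty node_weight array signals the C
# sampler to fall back to uniform node sampling when corrupting edges.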
if edge_weight is None:
    self._is_uniform = True
else:
    self._is_uniform = False
    self._edge_weight = F.zerocopy_to_dgl_ndarray(edge_weight[self._seed_edges])
    if node_weight is None:
        self._node_weight = empty((0,), 'float32')
    else:
        self._node_weight = F.zerocopy_to_dgl_ndarray(node_weight)
self._seed_edges = utils.toindex(self._seed_edges)
if prefetch:
@@ -606,6 +639,30 @@ class EdgeSampler(object):
self._negative_mode = negative_mode
self._neg_sample_size = neg_sample_size
self._exclude_positive = exclude_positive
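# Note (editorial, per the refactor described in the commit message): the
# sampler is created once as a stateful C object; fetch() below simply pulls
# the next batch of subgraphs from it.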
if self._is_uniform:
    self._sampler = _CAPI_CreateUniformEdgeSampler(
        self.g._graph,
        self.seed_edges.todgltensor(),
        self.batch_size, # batch size
        self._num_workers, # num batches
        self._negative_mode,
        self._neg_sample_size,
        self._exclude_positive,
        self._return_false_neg,
        self._relations)
else:
    self._sampler = _CAPI_CreateWeightedEdgeSampler(
        self.g._graph,
        self._seed_edges.todgltensor(),
        self._edge_weight,
        self._node_weight,
        self._batch_size, # batch size
        self._num_workers, # num batches
        self._negative_mode,
        self._neg_sample_size,
        self._exclude_positive,
        self._return_false_neg,
        self._relations)
def fetch(self, current_index):
'''
@@ -616,24 +673,19 @@ class EdgeSampler(object):
Parameters
----------
current_index : int
Deprecated; the argument is not actually used.
Returns
-------
list[GraphIndex] or list[(GraphIndex, GraphIndex)]
Next "bunch" of edges to be processed.
'''
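# Note (editorial): the C sampler created in __init__ keeps its own sampling
# state, so fetching only needs the sampler handle; current_index is ignored.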
if self._is_uniform:
    subgs = _CAPI_FetchUniformEdgeSample(
        self._sampler)
else:
    subgs = _CAPI_FetchWeightedEdgeSample(
        self._sampler)
if len(subgs) == 0:
    return []
@@ -673,7 +725,6 @@ class EdgeSampler(object):
def batch_size(self):
    return self._batch_size
def create_full_nodeflow(g, num_layers, add_self_loop=False):
"""Convert a full graph to NodeFlow to run a L-layer GNN model.
......
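A minimal usage sketch of the weighted sampler added here, mirroring the tests below. The random graph, the weight values, the PyTorch backend, and the stopping bound are illustrative assumptions, not part of the commit:

import scipy.sparse as sp
import torch
import dgl
from dgl.contrib.sampling import EdgeSampler

# A small random graph; readonly=True matches how the sampler tests build their graphs.
g = dgl.DGLGraph(sp.random(100, 100, density=0.1, format='coo'), readonly=True)

# One weight per edge / per node; a larger weight means a higher chance of being sampled.
edge_weight = torch.rand(g.number_of_edges())
node_weight = torch.rand(g.number_of_nodes())

batch_size = 50
total_samples = 0
for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                        edge_weight=edge_weight,
                                        node_weight=node_weight,
                                        negative_mode='head',
                                        neg_sample_size=10,
                                        exclude_positive=False,
                                        return_false_neg=True):
    # pos_edges/neg_edges are edge subgraphs; neg_edges.edata['false_neg'] marks false negatives.
    total_samples += batch_size
    if total_samples >= g.number_of_edges():
        break  # the weighted sampler yields batches indefinitely, so stop explicitly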
@@ -237,6 +237,7 @@ def check_head_tail(g):
def check_negative_sampler(mode, exclude_positive, neg_size):
g = generate_rand_graph(100)
num_edges = g.number_of_edges()
etype = np.random.randint(0, 10, size=g.number_of_edges(), dtype=np.int64)
g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu())
@@ -249,7 +250,10 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
# Test the homogeneous graph.
total_samples = 0
batch_size = 50
max_samples = num_edges
for pos_edges, neg_edges in EdgeSampler(g, batch_size,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=exclude_positive,
@@ -284,8 +288,13 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
else:
    assert F.array_equal(g.has_edges_between(neg_src, neg_dst), exist)
total_samples += batch_size
if (total_samples >= max_samples):
break
# Test the knowledge graph.
total_samples = 0
for _, neg_edges in EdgeSampler(g, batch_size,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=exclude_positive,
@@ -304,12 +313,223 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
etype = g.edata['etype'][eid]
exist = neg_edges.edata['etype'][i] == etype
assert F.asnumpy(exists[i]) == F.asnumpy(exist)
total_samples += batch_size
if (total_samples >= max_samples):
break
def check_weighted_negative_sampler(mode, exclude_positive, neg_size):
g = generate_rand_graph(100)
num_edges = g.number_of_edges()
num_nodes = g.number_of_nodes()
edge_weight = F.copy_to(F.tensor(np.full((num_edges,), 1, dtype=np.float32)), F.cpu())
node_weight = F.copy_to(F.tensor(np.full((num_nodes,), 1, dtype=np.float32)), F.cpu())
etype = np.random.randint(0, 10, size=num_edges, dtype=np.int64)
g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu())
pos_gsrc, pos_gdst, pos_geid = g.all_edges(form='all', order='eid')
pos_map = {}
for i in range(len(pos_geid)):
pos_d = int(F.asnumpy(pos_gdst[i]))
pos_e = int(F.asnumpy(pos_geid[i]))
pos_map[(pos_d, pos_e)] = int(F.asnumpy(pos_gsrc[i]))
EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
# Correctness check
# Test the homogeneous graph.
batch_size = 50
total_samples = 0
max_samples = num_edges
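# Note (editorial): all weights are 1 here, so the weighted sampler should
# behave like the uniform one; the checks below mirror check_negative_sampler.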
for pos_edges, neg_edges in EdgeSampler(g, batch_size,
edge_weight=edge_weight,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=exclude_positive,
return_false_neg=True):
pos_lsrc, pos_ldst, pos_leid = pos_edges.all_edges(form='all', order='eid')
assert_array_equal(F.asnumpy(pos_edges.parent_eid[pos_leid]),
F.asnumpy(g.edge_ids(pos_edges.parent_nid[pos_lsrc],
pos_edges.parent_nid[pos_ldst])))
neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
neg_src = neg_edges.parent_nid[neg_lsrc]
neg_dst = neg_edges.parent_nid[neg_ldst]
neg_eid = neg_edges.parent_eid[neg_leid]
for i in range(len(neg_eid)):
neg_d = int(F.asnumpy(neg_dst[i]))
neg_e = int(F.asnumpy(neg_eid[i]))
assert (neg_d, neg_e) in pos_map
if exclude_positive:
assert int(F.asnumpy(neg_src[i])) != pos_map[(neg_d, neg_e)]
check_head_tail(neg_edges)
pos_tails = pos_edges.parent_nid[pos_edges.tail_nid]
neg_tails = neg_edges.parent_nid[neg_edges.tail_nid]
pos_tails = np.sort(F.asnumpy(pos_tails))
neg_tails = np.sort(F.asnumpy(neg_tails))
np.testing.assert_equal(pos_tails, neg_tails)
exist = neg_edges.edata['false_neg']
if exclude_positive:
assert np.sum(F.asnumpy(exist) == 0) == len(exist)
else:
assert F.array_equal(g.has_edges_between(neg_src, neg_dst), exist)
total_samples += batch_size
if (total_samples >= max_samples):
break
# Test the knowledge graph with edge weight provided.
total_samples = 0
for pos_edges, neg_edges in EdgeSampler(g, batch_size,
edge_weight=edge_weight,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=exclude_positive,
relations=g.edata['etype'],
return_false_neg=True):
neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
neg_src = neg_edges.parent_nid[neg_lsrc]
neg_dst = neg_edges.parent_nid[neg_ldst]
neg_eid = neg_edges.parent_eid[neg_leid]
exists = neg_edges.edata['false_neg']
neg_edges.edata['etype'] = g.edata['etype'][neg_eid]
for i in range(len(neg_eid)):
u, v = F.asnumpy(neg_src[i]), F.asnumpy(neg_dst[i])
if g.has_edge_between(u, v):
eid = g.edge_id(u, v)
etype = g.edata['etype'][eid]
exist = neg_edges.edata['etype'][i] == etype
assert F.asnumpy(exists[i]) == F.asnumpy(exist)
total_samples += batch_size
if (total_samples >= max_samples):
break
# Test the knowledge graph with edge/node weight provided.
total_samples = 0
for pos_edges, neg_edges in EdgeSampler(g, batch_size,
edge_weight=edge_weight,
node_weight=node_weight,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=exclude_positive,
relations=g.edata['etype'],
return_false_neg=True):
neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
neg_src = neg_edges.parent_nid[neg_lsrc]
neg_dst = neg_edges.parent_nid[neg_ldst]
neg_eid = neg_edges.parent_eid[neg_leid]
exists = neg_edges.edata['false_neg']
neg_edges.edata['etype'] = g.edata['etype'][neg_eid]
for i in range(len(neg_eid)):
u, v = F.asnumpy(neg_src[i]), F.asnumpy(neg_dst[i])
if g.has_edge_between(u, v):
eid = g.edge_id(u, v)
etype = g.edata['etype'][eid]
exist = neg_edges.edata['etype'][i] == etype
assert F.asnumpy(exists[i]) == F.asnumpy(exist)
total_samples += batch_size
if (total_samples >= max_samples):
break
# Check Rate
dgl.random.seed(0)
g = generate_rand_graph(1000)
num_edges = g.number_of_edges()
num_nodes = g.number_of_nodes()
edge_weight = F.copy_to(F.tensor(np.full((num_edges,), 1, dtype=np.float32)), F.cpu())
edge_weight[0] = F.sum(edge_weight, dim=0)
node_weight = F.copy_to(F.tensor(np.full((num_nodes,), 1, dtype=np.float32)), F.cpu())
node_weight[-1] = F.sum(node_weight, dim=0) / 200
etype = np.random.randint(0, 20, size=num_edges, dtype=np.int64)
g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu())
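# Note (editorial) on the expected rates: edge_weight[0] is set to num_edges while
# the other edges keep weight 1, so edge 0 carries num_edges / (2 * num_edges - 1)
# ~= 0.5 of the edge probability mass and the second half of the edges ~= 0.25;
# node_weight[-1] = num_nodes / 200 = 5, i.e. the last node is 5x as likely to be
# drawn as any other node.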
# Test w/o node weight.
max_samples = num_edges / 5
# Test the knowledge graph with edge weight provided.
total_samples = 0
edge_sampled = np.full((num_edges,), 0, dtype=np.int32)
node_sampled = np.full((num_nodes,), 0, dtype=np.int32)
for pos_edges, neg_edges in EdgeSampler(g, batch_size,
edge_weight=edge_weight,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=False,
relations=g.edata['etype'],
return_false_neg=True):
_, _, pos_leid = pos_edges.all_edges(form='all', order='eid')
neg_lsrc, neg_ldst, _ = neg_edges.all_edges(form='all', order='eid')
if 'head' in mode:
neg_src = neg_edges.parent_nid[neg_lsrc]
np.add.at(node_sampled, F.asnumpy(neg_src), 1)
else:
neg_dst = neg_edges.parent_nid[neg_ldst]
np.add.at(node_sampled, F.asnumpy(neg_dst), 1)
np.add.at(edge_sampled, F.asnumpy(pos_edges.parent_eid[pos_leid]), 1)
total_samples += batch_size
if (total_samples >= max_samples):
break
# Check rate here
edge_rate_0 = edge_sampled[0] / edge_sampled.sum()
edge_tail_half_cnt = edge_sampled[edge_sampled.shape[0] // 2:-1].sum()
edge_rate_tail_half = edge_tail_half_cnt / edge_sampled.sum()
assert np.allclose(edge_rate_0, 0.5, atol=0.05)
assert np.allclose(edge_rate_tail_half, 0.25, atol=0.05)
node_rate_0 = node_sampled[0] / node_sampled.sum()
node_tail_half_cnt = node_sampled[node_sampled.shape[0] // 2:-1].sum()
node_rate_tail_half = node_tail_half_cnt / node_sampled.sum()
assert node_rate_0 < 0.02
assert np.allclose(node_rate_tail_half, 0.5, atol=0.02)
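# Note (editorial): without node_weight the corrupting nodes are drawn uniformly,
# so each node is expected at a rate of roughly 1/num_nodes = 0.001; the 0.02
# bound above is deliberately loose.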
# Test the knowledge graph with edge/node weight provided.
total_samples = 0
edge_sampled = np.full((num_edges,), 0, dtype=np.int32)
node_sampled = np.full((num_nodes,), 0, dtype=np.int32)
for pos_edges, neg_edges in EdgeSampler(g, batch_size,
edge_weight=edge_weight,
node_weight=node_weight,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=False,
relations=g.edata['etype'],
return_false_neg=True):
_, _, pos_leid = pos_edges.all_edges(form='all', order='eid')
neg_lsrc, neg_ldst, _ = neg_edges.all_edges(form='all', order='eid')
if 'head' in mode:
neg_src = neg_edges.parent_nid[neg_lsrc]
np.add.at(node_sampled, F.asnumpy(neg_src), 1)
else:
neg_dst = neg_edges.parent_nid[neg_ldst]
np.add.at(node_sampled, F.asnumpy(neg_dst), 1)
np.add.at(edge_sampled, F.asnumpy(pos_edges.parent_eid[pos_leid]), 1)
total_samples += batch_size
if (total_samples >= max_samples):
break
# Check rate here
edge_rate_0 = edge_sampled[0] / edge_sampled.sum()
edge_tail_half_cnt = edge_sampled[edge_sampled.shape[0] // 2:-1].sum()
edge_rate_tail_half = edge_tail_half_cnt / edge_sampled.sum()
assert np.allclose(edge_rate_0, 0.5, atol=0.05)
assert np.allclose(edge_rate_tail_half, 0.25, atol=0.05)
node_rate = node_sampled[-1] / node_sampled.sum()
node_rate_a = np.average(node_sampled[:50]) / node_sampled.sum()
node_rate_b = np.average(node_sampled[50:100]) / node_sampled.sum()
# As neg sampling does not contain duplicate nodes,
# this test takes some acceptable variation on the sample rate.
assert np.allclose(node_rate, node_rate_a * 5, atol=0.002)
assert np.allclose(node_rate_a, node_rate_b, atol=0.0002)
@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="Core dump")
def test_negative_sampler():
check_negative_sampler('PBG-head', False, 10)
check_negative_sampler('head', True, 10)
check_negative_sampler('head', False, 10)
check_weighted_negative_sampler('PBG-head', False, 10)
check_weighted_negative_sampler('head', True, 10)
check_weighted_negative_sampler('head', False, 10)
#disable this check for now. It might take too long time.
#check_negative_sampler('head', False, 100)
......