Unverified Commit 1022d5d6 authored by Da Zheng, committed by GitHub

[KG] PBG's way of constructing negative edges (#1159)

* attach positive.

* add neg_deg_sample.

* add comment.

* add neg_deg_sample for eval.

* change the edge sampler.

* rename edge sampler in KG.

* allow specifying chunk size and negative sample size separately.

* fix bugs in KG.

* add check in sampler.

* add more checks.

* fix

* add comment.

* add comments.
parent 1de192f4
......@@ -107,13 +107,14 @@ class TrainDataset(object):
count[(tail, -rel - 1)] += 1
return count
def create_sampler(self, batch_size, neg_sample_size=2, mode='head', num_workers=5,
def create_sampler(self, batch_size, neg_sample_size=2, neg_chunk_size=None, mode='head', num_workers=5,
shuffle=True, exclude_positive=False, rank=0):
EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
return EdgeSampler(self.g,
seed_edges=F.tensor(self.edge_parts[rank]),
batch_size=batch_size,
neg_sample_size=neg_sample_size,
chunk_size=neg_chunk_size,
negative_mode=mode,
num_workers=num_workers,
shuffle=shuffle,
......@@ -121,10 +122,10 @@ class TrainDataset(object):
return_false_neg=False)
class PBGNegEdgeSubgraph(dgl.subgraph.DGLSubGraph):
class ChunkNegEdgeSubgraph(dgl.subgraph.DGLSubGraph):
def __init__(self, subg, num_chunks, chunk_size,
neg_sample_size, neg_head):
super(PBGNegEdgeSubgraph, self).__init__(subg._parent, subg.sgi)
super(ChunkNegEdgeSubgraph, self).__init__(subg._parent, subg.sgi)
self.subg = subg
self.num_chunks = num_chunks
self.chunk_size = chunk_size
......@@ -140,7 +141,11 @@ class PBGNegEdgeSubgraph(dgl.subgraph.DGLSubGraph):
return self.subg.tail_nid
def create_neg_subgraph(pos_g, neg_g, is_pbg, neg_head, num_nodes):
# KG models need to know the number of chunks, the chunk size and the negative sample size
# of a negative subgraph to perform the computation more efficiently.
# This function infers all of this information from the negative subgraph
# and creates a wrapper class that carries it along with the subgraph.
def create_neg_subgraph(pos_g, neg_g, chunk_size, is_chunked, neg_head, num_nodes):
assert neg_g.number_of_edges() % pos_g.number_of_edges() == 0
neg_sample_size = int(neg_g.number_of_edges() / pos_g.number_of_edges())
# We use all nodes to create negative edges. Regardless of the sampling algorithm,
......@@ -149,30 +154,32 @@ def create_neg_subgraph(pos_g, neg_g, is_pbg, neg_head, num_nodes):
or (not neg_head and len(neg_g.tail_nid) == num_nodes):
num_chunks = 1
chunk_size = pos_g.number_of_edges()
elif is_pbg:
if pos_g.number_of_edges() < neg_sample_size:
elif is_chunked:
if pos_g.number_of_edges() < chunk_size:
num_chunks = 1
chunk_size = pos_g.number_of_edges()
else:
# This is probably the last batch. Let's ignore it.
if pos_g.number_of_edges() % neg_sample_size > 0:
if pos_g.number_of_edges() % chunk_size > 0:
return None
num_chunks = int(pos_g.number_of_edges()/ neg_sample_size)
chunk_size = neg_sample_size
num_chunks = int(pos_g.number_of_edges()/ chunk_size)
assert num_chunks * chunk_size == pos_g.number_of_edges()
assert num_chunks * neg_sample_size * chunk_size == neg_g.number_of_edges()
else:
num_chunks = pos_g.number_of_edges()
chunk_size = 1
return PBGNegEdgeSubgraph(neg_g, num_chunks, chunk_size,
neg_sample_size, neg_head)
return ChunkNegEdgeSubgraph(neg_g, num_chunks, chunk_size,
neg_sample_size, neg_head)
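For reference, here is a minimal standalone sketch of the chunk inference performed above, assuming only the positive/negative edge counts and the requested chunk size are known. The helper name infer_chunking is hypothetical and not part of this commit, and the "all nodes are used as negatives" special case is omitted for brevity.

```python
def infer_chunking(num_pos_edges, num_neg_edges, chunk_size, is_chunked):
    """Simplified mirror of the bookkeeping in create_neg_subgraph (sketch only)."""
    assert num_neg_edges % num_pos_edges == 0
    neg_sample_size = num_neg_edges // num_pos_edges
    if is_chunked:
        if num_pos_edges < chunk_size:
            # A batch smaller than one chunk is treated as a single chunk.
            return 1, num_pos_edges, neg_sample_size
        if num_pos_edges % chunk_size != 0:
            # Probably the last batch; the caller skips it.
            return None
        return num_pos_edges // chunk_size, chunk_size, neg_sample_size
    # Unchunked sampling: every positive edge forms its own chunk.
    return num_pos_edges, 1, neg_sample_size

# 1024 positive edges with 16 negatives each and a chunk size of 64:
# 16 chunks of 64 positive edges, each sharing 16 sampled negative nodes.
print(infer_chunking(1024, 1024 * 16, 64, True))  # (16, 64, 16)
```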
class EvalSampler(object):
def __init__(self, g, edges, batch_size, neg_sample_size, mode, num_workers,
def __init__(self, g, edges, batch_size, neg_sample_size, neg_chunk_size, mode, num_workers,
filter_false_neg):
EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
self.sampler = EdgeSampler(g,
batch_size=batch_size,
seed_edges=edges,
neg_sample_size=neg_sample_size,
chunk_size=neg_chunk_size,
negative_mode=mode,
num_workers=num_workers,
shuffle=False,
......@@ -184,6 +191,7 @@ class EvalSampler(object):
self.neg_head = 'head' in mode
self.g = g
self.filter_false_neg = filter_false_neg
self.neg_chunk_size = neg_chunk_size
def __iter__(self):
return self
......@@ -193,7 +201,7 @@ class EvalSampler(object):
pos_g, neg_g = next(self.sampler_iter)
if self.filter_false_neg:
neg_positive = neg_g.edata['false_neg']
neg_g = create_neg_subgraph(pos_g, neg_g, 'PBG' in self.mode,
neg_g = create_neg_subgraph(pos_g, neg_g, self.neg_chunk_size, 'chunk' in self.mode,
self.neg_head, self.g.number_of_nodes())
if neg_g is not None:
break
......@@ -280,22 +288,22 @@ class EvalDataset(object):
np.testing.assert_equal(F.asnumpy(dst_id), orig_dst)
np.testing.assert_equal(F.asnumpy(etype), orig_etype)
def create_sampler(self, eval_type, batch_size, neg_sample_size,
def create_sampler(self, eval_type, batch_size, neg_sample_size, neg_chunk_size,
filter_false_neg, mode='head', num_workers=5, rank=0, ranks=1):
edges = self.get_edges(eval_type)
beg = edges.shape[0] * rank // ranks
end = min(edges.shape[0] * (rank + 1) // ranks, edges.shape[0])
edges = edges[beg: end]
return EvalSampler(self.g, edges, batch_size, neg_sample_size,
return EvalSampler(self.g, edges, batch_size, neg_sample_size, neg_chunk_size,
mode, num_workers, filter_false_neg)
class NewBidirectionalOneShotIterator:
def __init__(self, dataloader_head, dataloader_tail, is_pbg, num_nodes):
def __init__(self, dataloader_head, dataloader_tail, neg_chunk_size, is_chunked, num_nodes):
self.sampler_head = dataloader_head
self.sampler_tail = dataloader_tail
self.iterator_head = self.one_shot_iterator(dataloader_head, is_pbg,
self.iterator_head = self.one_shot_iterator(dataloader_head, neg_chunk_size, is_chunked,
True, num_nodes)
self.iterator_tail = self.one_shot_iterator(dataloader_tail, is_pbg,
self.iterator_tail = self.one_shot_iterator(dataloader_tail, neg_chunk_size, is_chunked,
False, num_nodes)
self.step = 0
......@@ -308,10 +316,11 @@ class NewBidirectionalOneShotIterator:
return pos_g, neg_g
@staticmethod
def one_shot_iterator(dataloader, is_pbg, neg_head, num_nodes):
def one_shot_iterator(dataloader, neg_chunk_size, is_chunked, neg_head, num_nodes):
while True:
for pos_g, neg_g in dataloader:
neg_g = create_neg_subgraph(pos_g, neg_g, is_pbg, neg_head, num_nodes)
neg_g = create_neg_subgraph(pos_g, neg_g, neg_chunk_size, is_chunked,
neg_head, num_nodes)
if neg_g is None:
continue
......
......@@ -38,6 +38,10 @@ class ArgParser(argparse.ArgumentParser):
help='batch size used for eval and test')
self.add_argument('--neg_sample_size', type=int, default=-1,
help='negative sampling size for testing')
self.add_argument('--neg_deg_sample', action='store_true',
help='negative sampling proportional to vertex degree for testing')
self.add_argument('--neg_chunk_size', type=int, default=-1,
help='chunk size of the negative edges.')
self.add_argument('--hidden_dim', type=int, default=256,
help='hidden dim used by relation and entity')
self.add_argument('-g', '--gamma', type=float, default=12.0,
......@@ -86,6 +90,10 @@ def get_logger(args):
return logger
def main(args):
args.eval_filter = not args.no_eval_filter
if args.neg_deg_sample:
assert not args.eval_filter, "if negative sampling is based on degree, we can't filter positive edges."
# load dataset and samplers
dataset = get_dataset(args.data_path, args.dataset, args.format)
args.pickle_graph = False
......@@ -98,10 +106,14 @@ def main(args):
# Here we want to use the regular negative sampler because we need to ensure that
# all positive edges are excluded.
eval_dataset = EvalDataset(dataset, args)
args.neg_sample_size_test = args.neg_sample_size
args.neg_deg_sample_eval = args.neg_deg_sample
if args.neg_sample_size < 0:
args.neg_sample_size_test = args.neg_sample_size = eval_dataset.g.number_of_nodes()
args.eval_filter = not args.no_eval_filter
if args.neg_chunk_size < 0:
args.neg_chunk_size = args.neg_sample_size
num_workers = args.num_worker
# for multiprocessing evaluation, we don't need to sample multiple batches at a time
# in each process.
......@@ -113,14 +125,16 @@ def main(args):
for i in range(args.num_proc):
test_sampler_head = eval_dataset.create_sampler('test', args.batch_size,
args.neg_sample_size,
args.neg_chunk_size,
args.eval_filter,
mode='PBG-head',
mode='chunk-head',
num_workers=num_workers,
rank=i, ranks=args.num_proc)
test_sampler_tail = eval_dataset.create_sampler('test', args.batch_size,
args.neg_sample_size,
args.neg_chunk_size,
args.eval_filter,
mode='PBG-tail',
mode='chunk-tail',
num_workers=num_workers,
rank=i, ranks=args.num_proc)
test_sampler_heads.append(test_sampler_head)
......@@ -128,14 +142,16 @@ def main(args):
else:
test_sampler_head = eval_dataset.create_sampler('test', args.batch_size,
args.neg_sample_size,
args.neg_chunk_size,
args.eval_filter,
mode='PBG-head',
mode='chunk-head',
num_workers=num_workers,
rank=0, ranks=1)
test_sampler_tail = eval_dataset.create_sampler('test', args.batch_size,
args.neg_sample_size,
args.neg_chunk_size,
args.eval_filter,
mode='PBG-tail',
mode='chunk-tail',
num_workers=num_workers,
rank=0, ranks=1)
......
......@@ -94,36 +94,67 @@ class KEModel(object):
self.score_func(g)
return g.edata['score']
def predict_neg_score(self, pos_g, neg_g, to_device=None, gpu_id=-1, trace=False):
def predict_neg_score(self, pos_g, neg_g, to_device=None, gpu_id=-1, trace=False,
neg_deg_sample=False):
num_chunks = neg_g.num_chunks
chunk_size = neg_g.chunk_size
neg_sample_size = neg_g.neg_sample_size
mask = F.ones((num_chunks, chunk_size * (neg_sample_size + chunk_size)),
dtype=F.float32, ctx=F.context(pos_g.ndata['emb']))
if neg_g.neg_head:
neg_head_ids = neg_g.ndata['id'][neg_g.head_nid]
neg_head = self.entity_emb(neg_head_ids, gpu_id, trace)
_, tail_ids = pos_g.all_edges(order='eid')
head_ids, tail_ids = pos_g.all_edges(order='eid')
if to_device is not None and gpu_id >= 0:
tail_ids = to_device(tail_ids, gpu_id)
tail = pos_g.ndata['emb'][tail_ids]
rel = pos_g.edata['emb']
# When we train on a batch, we can use the head nodes of the positive edges to
# construct negative edges: a negative edge is created between every positive
# head node and every positive tail node in the same chunk.
# Constructed this way, each positive head node has exactly one true positive
# edge among the negatives, so those entries need to be masked
# (see the masking sketch after this function).
if neg_deg_sample:
head = pos_g.ndata['emb'][head_ids]
head = head.reshape(num_chunks, chunk_size, -1)
neg_head = neg_head.reshape(num_chunks, neg_sample_size, -1)
neg_head = F.cat([head, neg_head], 1)
neg_sample_size = chunk_size + neg_sample_size
mask[:,0::(neg_sample_size + 1)] = 0
neg_head = neg_head.reshape(num_chunks * neg_sample_size, -1)
neg_head, tail = self.head_neg_prepare(pos_g.edata['id'], num_chunks, neg_head, tail, gpu_id, trace)
neg_score = self.head_neg_score(neg_head, rel, tail,
num_chunks, chunk_size, neg_sample_size)
else:
neg_tail_ids = neg_g.ndata['id'][neg_g.tail_nid]
neg_tail = self.entity_emb(neg_tail_ids, gpu_id, trace)
head_ids, _ = pos_g.all_edges(order='eid')
head_ids, tail_ids = pos_g.all_edges(order='eid')
if to_device is not None and gpu_id >= 0:
head_ids = to_device(head_ids, gpu_id)
head = pos_g.ndata['emb'][head_ids]
rel = pos_g.edata['emb']
# The same negative edge construction as above, but corrupting tail nodes.
if neg_deg_sample:
tail = pos_g.ndata['emb'][tail_ids]
tail = tail.reshape(num_chunks, chunk_size, -1)
neg_tail = neg_tail.reshape(num_chunks, neg_sample_size, -1)
neg_tail = F.cat([tail, neg_tail], 1)
neg_sample_size = chunk_size + neg_sample_size
mask[:,0::(neg_sample_size + 1)] = 0
neg_tail = neg_tail.reshape(num_chunks * neg_sample_size, -1)
head, neg_tail = self.tail_neg_prepare(pos_g.edata['id'], num_chunks, head, neg_tail, gpu_id, trace)
neg_score = self.tail_neg_score(head, rel, neg_tail,
num_chunks, chunk_size, neg_sample_size)
return neg_score
if neg_deg_sample:
neg_g.neg_sample_size = neg_sample_size
mask = mask.reshape(num_chunks, chunk_size, neg_sample_size)
return neg_score * mask
else:
return neg_score
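The strided mask assignment above is compact, so here is a small NumPy sketch (illustrative only, not part of the commit) showing that mask[:, 0::(neg_sample_size + 1)] = 0 zeroes exactly the entry where each positive edge is scored against its own true head or tail once the positive nodes are prepended to the sampled negatives.

```python
import numpy as np

# Toy sizes: 2 chunks, 3 positive edges per chunk, 4 sampled negatives per chunk.
num_chunks, chunk_size, neg_sample_size = 2, 3, 4

# With neg_deg_sample, each chunk's own positive heads (or tails) are prepended
# to the sampled negatives, so every positive edge is scored against
# chunk_size + neg_sample_size candidates.
total_neg = chunk_size + neg_sample_size

mask = np.ones((num_chunks, chunk_size * total_neg), dtype=np.float32)
# Zero every (total_neg + 1)-th entry; after reshaping this hits entry (j, j)
# of each chunk, i.e. the slot where positive edge j meets its own true
# head (or tail) node.
mask[:, 0::(total_neg + 1)] = 0
mask = mask.reshape(num_chunks, chunk_size, total_neg)

print(mask[0])
# [[0. 1. 1. 1. 1. 1. 1.]
#  [1. 0. 1. 1. 1. 1. 1.]
#  [1. 1. 0. 1. 1. 1. 1.]]
```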
def forward_test(self, pos_g, neg_g, logs, gpu_id=-1):
pos_g.ndata['emb'] = self.entity_emb(pos_g.ndata['id'], gpu_id, False)
......@@ -136,7 +167,8 @@ class KEModel(object):
pos_scores = reshape(logsigmoid(pos_scores), batch_size, -1)
neg_scores = self.predict_neg_score(pos_g, neg_g, to_device=cuda,
gpu_id=gpu_id, trace=False)
gpu_id=gpu_id, trace=False,
neg_deg_sample=self.args.neg_deg_sample_eval)
neg_scores = reshape(logsigmoid(neg_scores), batch_size, -1)
# We need to filter the positive edges in the negative graph.
......@@ -171,9 +203,11 @@ class KEModel(object):
pos_score = logsigmoid(pos_score)
if gpu_id >= 0:
neg_score = self.predict_neg_score(pos_g, neg_g, to_device=cuda,
gpu_id=gpu_id, trace=True)
gpu_id=gpu_id, trace=True,
neg_deg_sample=self.args.neg_deg_sample)
else:
neg_score = self.predict_neg_score(pos_g, neg_g, trace=True)
neg_score = self.predict_neg_score(pos_g, neg_g, trace=True,
neg_deg_sample=self.args.neg_deg_sample)
neg_score = reshape(neg_score, -1, neg_g.neg_sample_size)
# Adversarial sampling
......
......@@ -137,14 +137,14 @@ def check_score_func(func_name):
EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
sampler = EdgeSampler(g, batch_size=batch_size,
neg_sample_size=neg_sample_size,
negative_mode='PBG-head',
negative_mode='chunk-head',
num_workers=1,
shuffle=False,
exclude_positive=False,
return_false_neg=False)
for pos_g, neg_g in sampler:
neg_g = create_neg_subgraph(pos_g, neg_g, True, True, g.number_of_nodes())
neg_g = create_neg_subgraph(pos_g, neg_g, neg_sample_size, True, True, g.number_of_nodes())
pos_g.copy_from_parent()
neg_g.copy_from_parent()
score1 = F.reshape(model.predict_score(neg_g), (batch_size, -1))
......
......@@ -47,10 +47,20 @@ class ArgParser(argparse.ArgumentParser):
help='batch size used for eval and test')
self.add_argument('--neg_sample_size', type=int, default=128,
help='negative sampling size')
self.add_argument('--neg_chunk_size', type=int, default=-1,
help='chunk size of the negative edges.')
self.add_argument('--neg_deg_sample', action='store_true',
help='negative sample proportional to vertex degree in the training')
self.add_argument('--neg_deg_sample_eval', action='store_true',
help='negative sampling proportional to vertex degree in the evaluation')
self.add_argument('--neg_sample_size_valid', type=int, default=1000,
help='negative sampling size for validation')
self.add_argument('--neg_chunk_size_valid', type=int, default=-1,
help='chunk size of the negative edges.')
self.add_argument('--neg_sample_size_test', type=int, default=-1,
help='negative sampling size for testing')
self.add_argument('--neg_chunk_size_test', type=int, default=-1,
help='chunk size of the negative edges.')
self.add_argument('--hidden_dim', type=int, default=256,
help='hidden dim used by relation and entity')
self.add_argument('--lr', type=float, default=0.0001,
......@@ -138,37 +148,56 @@ def run(args, logger):
if args.neg_sample_size_test < 0:
args.neg_sample_size_test = n_entities
args.eval_filter = not args.no_eval_filter
if args.neg_deg_sample_eval:
assert not args.eval_filter, "if negative sampling is based on degree, we can't filter positive edges."
# When we generate a batch of negative edges from a set of positive edges,
# we first divide the positive edges into chunks and corrupt all edges in a chunk
# together. By default, the chunk size equals the negative sample size, which
# usually works well, but users can also specify the chunk size themselves
# (see the worked example after this block).
if args.neg_chunk_size < 0:
args.neg_chunk_size = args.neg_sample_size
if args.neg_chunk_size_valid < 0:
args.neg_chunk_size_valid = args.neg_sample_size_valid
if args.neg_chunk_size_test < 0:
args.neg_chunk_size_test = args.neg_sample_size_test
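To make the chunking comment above concrete, here is a small illustrative sketch that materializes chunk-tail negatives for a toy batch. It is not part of the commit; in DGL the corruption happens inside the C++ sampler and only node IDs are produced, and the helper name below is hypothetical.

```python
import random

def chunk_tail_negatives(pos_edges, chunk_size, neg_sample_size, num_nodes):
    """Corrupt the tail of each positive edge, one chunk at a time.

    Every chunk of `chunk_size` positive edges shares one set of
    `neg_sample_size` uniformly sampled candidate tails, so each chunk
    contributes chunk_size * neg_sample_size negative edges.
    """
    assert len(pos_edges) % chunk_size == 0
    neg_edges = []
    for start in range(0, len(pos_edges), chunk_size):
        chunk = pos_edges[start:start + chunk_size]
        shared_tails = random.sample(range(num_nodes), neg_sample_size)
        for head, rel, _tail in chunk:
            neg_edges.extend((head, rel, neg_tail) for neg_tail in shared_tails)
    return neg_edges

# 4 positive triples, chunk size 2, 3 shared negatives per chunk -> 12 negatives.
pos = [(0, 0, 1), (2, 0, 3), (4, 1, 5), (6, 1, 7)]
print(len(chunk_tail_negatives(pos, chunk_size=2, neg_sample_size=3, num_nodes=100)))
```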
train_data = TrainDataset(dataset, args, ranks=args.num_proc)
if args.num_proc > 1:
train_samplers = []
for i in range(args.num_proc):
train_sampler_head = train_data.create_sampler(args.batch_size, args.neg_sample_size,
mode='PBG-head',
args.neg_chunk_size,
mode='chunk-head',
num_workers=args.num_worker,
shuffle=True,
exclude_positive=True,
rank=i)
train_sampler_tail = train_data.create_sampler(args.batch_size, args.neg_sample_size,
mode='PBG-tail',
args.neg_chunk_size,
mode='chunk-tail',
num_workers=args.num_worker,
shuffle=True,
exclude_positive=True,
rank=i)
train_samplers.append(NewBidirectionalOneShotIterator(train_sampler_head, train_sampler_tail,
args.neg_chunk_size,
True, n_entities))
else:
train_sampler_head = train_data.create_sampler(args.batch_size, args.neg_sample_size,
mode='PBG-head',
args.neg_chunk_size,
mode='chunk-head',
num_workers=args.num_worker,
shuffle=True,
exclude_positive=True)
train_sampler_tail = train_data.create_sampler(args.batch_size, args.neg_sample_size,
mode='PBG-tail',
args.neg_chunk_size,
mode='chunk-tail',
num_workers=args.num_worker,
shuffle=True,
exclude_positive=True)
train_sampler = NewBidirectionalOneShotIterator(train_sampler_head, train_sampler_tail,
args.neg_chunk_size,
True, n_entities)
# for multiprocessing evaluation, we don't need to sample multiple batches at a time
......@@ -187,14 +216,16 @@ def run(args, logger):
for i in range(args.num_proc):
valid_sampler_head = eval_dataset.create_sampler('valid', args.batch_size_eval,
args.neg_sample_size_valid,
args.neg_chunk_size_valid,
args.eval_filter,
mode='PBG-head',
mode='chunk-head',
num_workers=num_workers,
rank=i, ranks=args.num_proc)
valid_sampler_tail = eval_dataset.create_sampler('valid', args.batch_size_eval,
args.neg_sample_size_valid,
args.neg_chunk_size_valid,
args.eval_filter,
mode='PBG-tail',
mode='chunk-tail',
num_workers=num_workers,
rank=i, ranks=args.num_proc)
valid_sampler_heads.append(valid_sampler_head)
......@@ -202,14 +233,16 @@ def run(args, logger):
else:
valid_sampler_head = eval_dataset.create_sampler('valid', args.batch_size_eval,
args.neg_sample_size_valid,
args.neg_chunk_size_valid,
args.eval_filter,
mode='PBG-head',
mode='chunk-head',
num_workers=num_workers,
rank=0, ranks=1)
valid_sampler_tail = eval_dataset.create_sampler('valid', args.batch_size_eval,
args.neg_sample_size_valid,
args.neg_chunk_size_valid,
args.eval_filter,
mode='PBG-tail',
mode='chunk-tail',
num_workers=num_workers,
rank=0, ranks=1)
if args.test:
......@@ -221,14 +254,16 @@ def run(args, logger):
for i in range(args.num_proc):
test_sampler_head = eval_dataset.create_sampler('test', args.batch_size_eval,
args.neg_sample_size_test,
args.neg_chunk_size_test,
args.eval_filter,
mode='PBG-head',
mode='chunk-head',
num_workers=num_workers,
rank=i, ranks=args.num_proc)
test_sampler_tail = eval_dataset.create_sampler('test', args.batch_size_eval,
args.neg_sample_size_test,
args.neg_chunk_size_test,
args.eval_filter,
mode='PBG-tail',
mode='chunk-tail',
num_workers=num_workers,
rank=i, ranks=args.num_proc)
test_sampler_heads.append(test_sampler_head)
......@@ -236,14 +271,16 @@ def run(args, logger):
else:
test_sampler_head = eval_dataset.create_sampler('test', args.batch_size_eval,
args.neg_sample_size_test,
args.neg_chunk_size_test,
args.eval_filter,
mode='PBG-head',
mode='chunk-head',
num_workers=num_workers,
rank=0, ranks=1)
test_sampler_tail = eval_dataset.create_sampler('test', args.batch_size_eval,
args.neg_sample_size_test,
args.neg_chunk_size_test,
args.eval_filter,
mode='PBG-tail',
mode='chunk-tail',
num_workers=num_workers,
rank=0, ranks=1)
......
......@@ -500,12 +500,16 @@ class EdgeSampler(object):
* 'tail': the negative edges are generated by corrupting tail nodes with uniformly randomly sampled nodes,
* 'PBG-head': the negative edges are generated by corrupting a set \
of head nodes with the same set of nodes uniformly randomly sampled \
from the graph. Please see Pytorch-BigGraph for more details.
* 'chunk-head': the negative edges are generated for a chunk of positive edges. \
The sampler first groups positive edges into chunks and corrupts the edges in a chunk together \
by replacing their head nodes with the same set of nodes uniformly randomly sampled \
from the graph.
* 'PBG-tail': the negative edges are generated by corrupting a set \
of tail nodes with the same set of nodes similar to 'PBG-head'.
* 'chunk-tail': the negative edges are generated by corrupting a set \
of tail nodes with the same set of nodes, analogous to 'chunk-head'.
When we use chunked negative sampling, a chunk size needs to be specified. By default,
the chunk size is the same as the negative sample size.
The sampler returns EdgeSubgraph, where a user can access the unique head nodes
and tail nodes directly.
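A minimal usage sketch of the chunked mode, modeled on the unit test touched by this commit; the toy graph construction assumes a DGL version contemporary with this change (a DGLGraph built from a SciPy sparse matrix with readonly=True), and all sizes are illustrative.

```python
import numpy as np
import scipy.sparse as sp
import dgl

# Toy read-only graph; EdgeSampler requires an immutable graph.
num_nodes = 100
adj = (sp.random(num_nodes, num_nodes, density=0.1, format='coo') != 0).astype(np.int64)
g = dgl.DGLGraph(adj, readonly=True)

sampler = dgl.contrib.sampling.EdgeSampler(
    g,
    batch_size=50,             # positive edges per batch
    neg_sample_size=20,        # negative nodes shared within a chunk
    chunk_size=10,             # positive edges corrupted together
    negative_mode='chunk-head',
    num_workers=1,
    shuffle=False,
    exclude_positive=False,
    return_false_neg=False)

for pos_g, neg_g in sampler:
    # Each chunk of 10 positive edges shares the same 20 corrupted heads,
    # so neg_g holds batch_size * neg_sample_size = 1000 negative edges.
    print(pos_g.number_of_edges(), neg_g.number_of_edges())
    break
```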
......@@ -577,6 +581,8 @@ class EdgeSampler(object):
The method used to construct negative edges. Possible values are 'head', 'tail'.
neg_sample_size : int, optional
The number of negative edges to sample for each edge.
chunk_size : int, optional
The chunk size for chunked negative sampling.
exclude_positive : int, optional
Whether to exclude positive edges from the negative edges.
return_false_neg: bool, optional
......@@ -615,7 +621,8 @@ class EdgeSampler(object):
neg_sample_size=0,
exclude_positive=False,
return_false_neg=False,
relations=None):
relations=None,
chunk_size=None):
self._g = g
if self.immutable_only and not g._graph.is_readonly():
raise NotImplementedError("This loader only support read-only graphs.")
......@@ -659,8 +666,16 @@ class EdgeSampler(object):
self._replacement = replacement
self._reset = reset
if chunk_size is None and negative_mode in ('chunk-head', 'chunk-tail'):
chunk_size = neg_sample_size
elif chunk_size is None:
chunk_size = -1
assert negative_mode in ('', 'head', 'tail', 'chunk-head', 'chunk-tail')
self._num_workers = int(num_workers)
self._negative_mode = negative_mode
self._chunk_size = chunk_size
self._neg_sample_size = neg_sample_size
self._exclude_positive = exclude_positive
if self._is_uniform:
......@@ -675,7 +690,8 @@ class EdgeSampler(object):
self._neg_sample_size,
self._exclude_positive,
self._return_false_neg,
self._relations)
self._relations,
self._chunk_size)
else:
self._sampler = _CAPI_CreateWeightedEdgeSampler(
self.g._graph,
......@@ -690,7 +706,8 @@ class EdgeSampler(object):
self._neg_sample_size,
self._exclude_positive,
self._return_false_neg,
self._relations)
self._relations,
self._chunk_size)
def fetch(self, current_index):
'''
......
......@@ -125,6 +125,7 @@ class EdgeSamplerObject: public Object {
const bool reset,
const std::string neg_mode,
const int64_t neg_sample_size,
const int64_t chunk_size,
const bool exclude_positive,
const bool check_false_neg,
IdArray relations) {
......@@ -140,6 +141,7 @@ class EdgeSamplerObject: public Object {
neg_sample_size_ = neg_sample_size;
exclude_positive_ = exclude_positive;
check_false_neg_ = check_false_neg;
chunk_size_ = chunk_size;
}
~EdgeSamplerObject() {}
......@@ -157,11 +159,11 @@ class EdgeSamplerObject: public Object {
int64_t neg_sample_size,
bool exclude_positive,
bool check_false_neg);
NegSubgraph genPBGNegEdgeSubgraph(const Subgraph &pos_subg,
const std::string &neg_mode,
int64_t neg_sample_size,
bool exclude_positive,
bool check_false_neg);
NegSubgraph genChunkedNegEdgeSubgraph(const Subgraph &pos_subg,
const std::string &neg_mode,
int64_t neg_sample_size,
bool exclude_positive,
bool check_false_neg);
GraphPtr gptr_;
IdArray seed_edges_;
......@@ -175,6 +177,7 @@ class EdgeSamplerObject: public Object {
int64_t neg_sample_size_;
bool exclude_positive_;
bool check_false_neg_;
int64_t chunk_size_;
};
/*
......@@ -1250,11 +1253,11 @@ NegSubgraph EdgeSamplerObject::genNegEdgeSubgraph(const Subgraph &pos_subg,
return neg_subg;
}
NegSubgraph EdgeSamplerObject::genPBGNegEdgeSubgraph(const Subgraph &pos_subg,
const std::string &neg_mode,
int64_t neg_sample_size,
bool exclude_positive,
bool check_false_neg) {
NegSubgraph EdgeSamplerObject::genChunkedNegEdgeSubgraph(const Subgraph &pos_subg,
const std::string &neg_mode,
int64_t neg_sample_size,
bool exclude_positive,
bool check_false_neg) {
int64_t num_tot_nodes = gptr_->NumVertices();
std::vector<IdArray> adj = pos_subg.graph->GetAdj(false, "coo");
IdArray coo = adj[0];
......@@ -1262,7 +1265,8 @@ NegSubgraph EdgeSamplerObject::genPBGNegEdgeSubgraph(const Subgraph &pos_subg,
if (neg_sample_size > num_tot_nodes)
neg_sample_size = num_tot_nodes;
int64_t chunk_size = neg_sample_size;
int64_t chunk_size = chunk_size_;
CHECK_GT(chunk_size, 0) << "chunk size has to be positive";
// If num_pos_edges isn't divisible by chunk_size, the actual number of chunks
// is num_chunks + 1 and the last chunk size is last_chunk_size.
// Otherwise, the actual number of chunks is num_chunks, the last chunk size
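A quick arithmetic illustration of the chunk bookkeeping described in the comment above. This is a sketch only: the Python below is not part of the commit, and the handling of the trailing chunk in the C++ code is not fully shown in this hunk.

```python
def chunk_counts(num_pos_edges, chunk_size):
    # Full-size chunk count plus an optional smaller trailing chunk when the
    # batch is not evenly divisible by the chunk size.
    num_chunks = num_pos_edges // chunk_size
    last_chunk_size = num_pos_edges % chunk_size
    actual_chunks = num_chunks + (1 if last_chunk_size > 0 else 0)
    return num_chunks, last_chunk_size, actual_chunks

print(chunk_counts(1000, 64))  # (15, 40, 16): 15 full chunks plus one chunk of 40
print(chunk_counts(1024, 64))  # (16, 0, 16): evenly divisible, no trailing chunk
```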
......@@ -1316,6 +1320,7 @@ NegSubgraph EdgeSamplerObject::genPBGNegEdgeSubgraph(const Subgraph &pos_subg,
randomSample(num_tot_nodes,
num_chunks * neg_sample_size,
&global_neg_vids);
CHECK_EQ(num_chunks * neg_sample_size, global_neg_vids.size());
std::unordered_map<dgl_id_t, dgl_id_t> neg_map;
dgl_id_t local_vid = 0;
......@@ -1438,6 +1443,7 @@ public:
const bool reset,
const std::string neg_mode,
const int64_t neg_sample_size,
const int64_t chunk_size,
const bool exclude_positive,
const bool check_false_neg,
IdArray relations)
......@@ -1449,6 +1455,7 @@ public:
reset,
neg_mode,
neg_sample_size,
chunk_size,
exclude_positive,
check_false_neg,
relations) {
......@@ -1499,15 +1506,15 @@ public:
Subgraph subg = gptr_->EdgeSubgraph(worker_seeds, false);
positive_subgs[i] = ConvertRef(subg);
// For PBG negative sampling, we accept "PBG-head" for corrupting head
// nodes and "PBG-tail" for corrupting tail nodes.
if (neg_mode_.substr(0, 3) == "PBG") {
NegSubgraph neg_subg = genPBGNegEdgeSubgraph(subg, neg_mode_.substr(4),
neg_sample_size_,
exclude_positive_,
check_false_neg_);
// For chunked negative sampling, we accept "chunk-head" for corrupting head
// nodes and "chunk-tail" for corrupting tail nodes.
if (neg_mode_.substr(0, 5) == "chunk") {
NegSubgraph neg_subg = genChunkedNegEdgeSubgraph(subg, neg_mode_.substr(6),
neg_sample_size_,
exclude_positive_,
check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg);
} else if (neg_mode_.size() > 0) {
} else if (neg_mode_ == "head" || neg_mode_ == "tail") {
NegSubgraph neg_subg = genNegEdgeSubgraph(subg, neg_mode_,
neg_sample_size_,
exclude_positive_,
......@@ -1585,6 +1592,7 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_CreateUniformEdgeSampler")
const bool exclude_positive = args[8];
const bool check_false_neg = args[9];
IdArray relations = args[10];
const int64_t chunk_size = args[11];
// process args
auto gptr = std::dynamic_pointer_cast<ImmutableGraph>(g.sptr());
CHECK(gptr) << "sampling isn't implemented in mutable graph";
......@@ -1607,6 +1615,7 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_CreateUniformEdgeSampler")
reset,
neg_mode,
neg_sample_size,
chunk_size,
exclude_positive,
check_false_neg,
relations);
......@@ -1638,6 +1647,7 @@ class WeightedEdgeSamplerObject: public EdgeSamplerObject {
const bool reset,
const std::string neg_mode,
const int64_t neg_sample_size,
const int64_t chunk_size,
const bool exclude_positive,
const bool check_false_neg,
IdArray relations)
......@@ -1649,6 +1659,7 @@ class WeightedEdgeSamplerObject: public EdgeSamplerObject {
reset,
neg_mode,
neg_sample_size,
chunk_size,
exclude_positive,
check_false_neg,
relations) {
......@@ -1723,15 +1734,15 @@ class WeightedEdgeSamplerObject: public EdgeSamplerObject {
// TODO(zhengda) what if there are duplicates in the src and dst vectors.
Subgraph subg = gptr_->EdgeSubgraph(worker_seeds, false);
positive_subgs[i] = ConvertRef(subg);
// For PBG negative sampling, we accept "PBG-head" for corrupting head
// nodes and "PBG-tail" for corrupting tail nodes.
if (neg_mode_.substr(0, 3) == "PBG") {
NegSubgraph neg_subg = genPBGNegEdgeSubgraph(subg, neg_mode_.substr(4),
neg_sample_size_,
exclude_positive_,
check_false_neg_);
// For chunked negative sampling, we accept "chunk-head" for corrupting head
// nodes and "chunk-tail" for corrupting tail nodes.
if (neg_mode_.substr(0, 5) == "chunk") {
NegSubgraph neg_subg = genChunkedNegEdgeSubgraph(subg, neg_mode_.substr(6),
neg_sample_size_,
exclude_positive_,
check_false_neg_);
negative_subgs[i] = ConvertRef(neg_subg);
} else if (neg_mode_.size() > 0) {
} else if (neg_mode_ == "head" || neg_mode_ == "tail") {
NegSubgraph neg_subg = genNegEdgeSubgraph(subg, neg_mode_,
neg_sample_size_,
exclude_positive_,
......@@ -1867,6 +1878,7 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_CreateWeightedEdgeSampler")
const bool exclude_positive = args[10];
const bool check_false_neg = args[11];
IdArray relations = args[12];
const int64_t chunk_size = args[13];
auto gptr = std::dynamic_pointer_cast<ImmutableGraph>(g.sptr());
CHECK(gptr) << "sampling isn't implemented in mutable graph";
......@@ -1904,6 +1916,7 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_CreateWeightedEdgeSampler")
reset,
neg_mode,
neg_sample_size,
chunk_size,
exclude_positive,
check_false_neg,
relations);
......
......@@ -700,10 +700,10 @@ def check_positive_edge_sampler():
@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="TF doesn't support item assignment")
def test_negative_sampler():
check_negative_sampler('PBG-head', False, 10)
check_negative_sampler('chunk-head', False, 10)
check_negative_sampler('head', True, 10)
check_negative_sampler('head', False, 10)
check_weighted_negative_sampler('PBG-head', False, 10)
check_weighted_negative_sampler('chunk-head', False, 10)
check_weighted_negative_sampler('head', True, 10)
check_weighted_negative_sampler('head', False, 10)
check_positive_edge_sampler()
......