Unverified Commit 3e454476 authored by Da Zheng's avatar Da Zheng Committed by GitHub
Browse files

edge sampler return head nodes and tail nodes. (#889)

* edge sampler return head nodes and tail nodes.

* edge subgraph.

* use VecToIdArray

* example.

* fix compile error.
parent 9c790b11
...@@ -385,6 +385,12 @@ struct Subgraph : public runtime::Object { ...@@ -385,6 +385,12 @@ struct Subgraph : public runtime::Object {
struct NegSubgraph: public Subgraph { struct NegSubgraph: public Subgraph {
/*! \brief The existence of the negative edges in the parent graph. */ /*! \brief The existence of the negative edges in the parent graph. */
IdArray exist; IdArray exist;
/*! \brief The Ids of head nodes */
IdArray head_nid;
/*! \brief The Ids of tail nodes */
IdArray tail_nid;
}; };
// Define SubgraphRef // Define SubgraphRef
......
...@@ -436,6 +436,45 @@ class LayerSampler(NodeFlowSampler): ...@@ -436,6 +436,45 @@ class LayerSampler(NodeFlowSampler):
nflows = [NodeFlow(self.g, obj) for obj in nfobjs] nflows = [NodeFlow(self.g, obj) for obj in nfobjs]
return nflows return nflows
class EdgeSubgraph(subgraph.DGLSubGraph):
    '''Subgraph produced by an edge sampler.

    In addition to the regular subgraph interface, the head nodes and the
    tail nodes of the subgraph are available through the ``head_nid`` and
    ``tail_nid`` properties. Both are computed on first access and cached.

    Parameters
    ----------
    parent :
        The parent graph; forwarded unchanged to the base class.
    sgi :
        The subgraph index object; forwarded to the base class and later
        handed to the C API when head/tail nodes are requested.
    neg : bool
        True if the subgraph holds negative edges. It selects how the
        head/tail nodes are obtained (see ``set_head_tail``).
    '''
    def __init__(self, parent, sgi, neg):
        super(EdgeSubgraph, self).__init__(parent, sgi)
        self.sgi = sgi
        self.neg = neg
        # Filled in lazily by set_head_tail().
        self.head = None
        self.tail = None

    def set_head_tail(self):
        ''' Compute and cache the head and tail nodes if not done yet.
        '''
        # Both values are already cached; nothing to do.
        if self.head is not None and self.tail is not None:
            return

        if not self.neg:
            # Derive the node sets from the edges of this subgraph.
            src, dst = self.all_edges()
            self.head = F.unique(src)
            self.tail = F.unique(dst)
            return

        # For a negative subgraph the node sets are read from the C API.
        head_ids = _CAPI_GetEdgeSubgraphHead(self.sgi)
        self.head = utils.toindex(head_ids).tousertensor()
        tail_ids = _CAPI_GetEdgeSubgraphTail(self.sgi)
        self.tail = utils.toindex(tail_ids).tousertensor()

    @property
    def head_nid(self):
        ''' The unique Ids of the head nodes.
        '''
        self.set_head_tail()
        return self.head

    @property
    def tail_nid(self):
        ''' The unique Ids of the tail nodes.
        '''
        self.set_head_tail()
        return self.tail
class EdgeSampler(object): class EdgeSampler(object):
'''Edge sampler for link prediction. '''Edge sampler for link prediction.
...@@ -465,6 +504,9 @@ class EdgeSampler(object): ...@@ -465,6 +504,9 @@ class EdgeSampler(object):
'PBG-tail': the negative edges are generated by corrupting a set 'PBG-tail': the negative edges are generated by corrupting a set
of tail nodes with the same set of nodes similar to 'PBG-head'. of tail nodes with the same set of nodes similar to 'PBG-head'.
The sampler returns EdgeSubgraph, where a user can access the unique head nodes
and tail nodes directly.
When the flag return_false_neg is turned on, the sampler will also check When the flag return_false_neg is turned on, the sampler will also check
if the generated negative edges are true negative edges and will return if the generated negative edges are true negative edges and will return
a vector that indicates false negative edges. The vector is stored in a vector that indicates false negative edges. The vector is stored in
...@@ -500,6 +542,13 @@ class EdgeSampler(object): ...@@ -500,6 +542,13 @@ class EdgeSampler(object):
relations: tensor, optional relations: tensor, optional
relations of the edges if this is a knowledge graph. relations of the edges if this is a knowledge graph.
Examples
--------
>>> for pos_g, neg_g in EdgeSampler(g, batch_size=10):
>>> print(pos_g.head_nid, pos_g.tail_nid)
>>> print(neg_g.head_nid, neg_g.tail_nid)
>>> print(neg_g.edata['false_neg'])
Class properties Class properties
---------------- ----------------
immutable_only : bool immutable_only : bool
...@@ -592,10 +641,10 @@ class EdgeSampler(object): ...@@ -592,10 +641,10 @@ class EdgeSampler(object):
assert len(subgs) % 2 == 0 assert len(subgs) % 2 == 0
num_pos = int(len(subgs) / 2) num_pos = int(len(subgs) / 2)
for i in range(num_pos): for i in range(num_pos):
pos_subg = subgraph.DGLSubGraph(self.g, subgs[i]) pos_subg = EdgeSubgraph(self.g, subgs[i], False)
neg_subg = subgraph.DGLSubGraph(self.g, subgs[i + num_pos]) neg_subg = EdgeSubgraph(self.g, subgs[i + num_pos], True)
if self._return_false_neg: if self._return_false_neg:
exist = _CAPI_GetNegEdgeExistence(subgs[i + num_pos]); exist = _CAPI_GetNegEdgeExistence(subgs[i + num_pos])
neg_subg.edata['false_neg'] = utils.toindex(exist).tousertensor() neg_subg.edata['false_neg'] = utils.toindex(exist).tousertensor()
rets.append((pos_subg, neg_subg)) rets.append((pos_subg, neg_subg))
return rets return rets
......
...@@ -107,7 +107,6 @@ class ArrayHeap { ...@@ -107,7 +107,6 @@ class ArrayHeap {
* Uniformly sample integers from [0, set_size) without replacement. * Uniformly sample integers from [0, set_size) without replacement.
*/ */
void RandomSample(size_t set_size, size_t num, std::vector<size_t>* out) { void RandomSample(size_t set_size, size_t num, std::vector<size_t>* out) {
out->clear();
if (num < set_size) { if (num < set_size) {
std::unordered_set<size_t> sampled_idxs; std::unordered_set<size_t> sampled_idxs;
while (sampled_idxs.size() < num) { while (sampled_idxs.size() < num) {
...@@ -128,7 +127,6 @@ void RandomSample(size_t set_size, size_t num, const std::vector<size_t> &exclud ...@@ -128,7 +127,6 @@ void RandomSample(size_t set_size, size_t num, const std::vector<size_t> &exclud
for (auto v : exclude) { for (auto v : exclude) {
sampled_idxs.insert(std::pair<size_t, int>(v, 0)); sampled_idxs.insert(std::pair<size_t, int>(v, 0));
} }
out->clear();
if (num + exclude.size() < set_size) { if (num + exclude.size() < set_size) {
while (sampled_idxs.size() < num + exclude.size()) { while (sampled_idxs.size() < num + exclude.size()) {
size_t rand = RandomEngine::ThreadLocal()->RandInt(set_size); size_t rand = RandomEngine::ThreadLocal()->RandInt(set_size);
...@@ -979,6 +977,17 @@ IdArray CheckExistence(GraphPtr gptr, IdArray relations, ...@@ -979,6 +977,17 @@ IdArray CheckExistence(GraphPtr gptr, IdArray relations,
return exist; return exist;
} }
std::vector<dgl_id_t> Global2Local(const std::vector<dgl_id_t> &ids,
const std::unordered_map<dgl_id_t, dgl_id_t> &map) {
std::vector<dgl_id_t> local_ids(ids.size());
for (size_t i = 0; i < ids.size(); i++) {
auto it = map.find(ids[i]);
assert(it != map.end());
local_ids[i] = it->second;
}
return local_ids;
}
NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &pos_subg, NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &pos_subg,
const std::string &neg_mode, const std::string &neg_mode,
int neg_sample_size, bool exclude_positive, int neg_sample_size, bool exclude_positive,
...@@ -1020,10 +1029,13 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po ...@@ -1020,10 +1029,13 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po
neg_changed = neg_dst_data; neg_changed = neg_dst_data;
} }
std::unordered_map<dgl_id_t, dgl_id_t> neg_map;
std::vector<dgl_id_t> local_pos_vids;
local_pos_vids.reserve(num_pos_edges);
dgl_id_t curr_eid = 0; dgl_id_t curr_eid = 0;
std::vector<size_t> neg_vids; std::vector<size_t> neg_vids;
neg_vids.reserve(neg_sample_size); neg_vids.reserve(neg_sample_size);
std::unordered_map<dgl_id_t, dgl_id_t> neg_map;
// If we don't exclude positive edges, we are actually sampling more than // If we don't exclude positive edges, we are actually sampling more than
// the total number of nodes in the graph. // the total number of nodes in the graph.
if (!exclude_positive && neg_sample_size >= num_tot_nodes) { if (!exclude_positive && neg_sample_size >= num_tot_nodes) {
...@@ -1032,8 +1044,30 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po ...@@ -1032,8 +1044,30 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po
neg_vids.push_back(i); neg_vids.push_back(i);
neg_map[i] = i; neg_map[i] = i;
} }
// Get all nodes in the positive side.
for (int64_t i = 0; i < num_pos_edges; i++) {
dgl_id_t vid = induced_vid_data[unchanged[i]];
local_pos_vids.push_back(neg_map[vid]);
}
// There is no guarantee that the nodes in the vector are unique.
std::sort(local_pos_vids.begin(), local_pos_vids.end());
auto it = std::unique(local_pos_vids.begin(), local_pos_vids.end());
local_pos_vids.resize(it - local_pos_vids.begin());
} else {
// Collect nodes in the positive side.
dgl_id_t local_vid = 0;
for (int64_t i = 0; i < num_pos_edges; i++) {
dgl_id_t vid = induced_vid_data[unchanged[i]];
auto it = neg_map.find(vid);
if (it == neg_map.end()) {
local_pos_vids.push_back(local_vid);
neg_map.insert(std::pair<dgl_id_t, dgl_id_t>(vid, local_vid++));
}
}
} }
int64_t prev_neg_offset = 0;
for (int64_t i = 0; i < num_pos_edges; i++) { for (int64_t i = 0; i < num_pos_edges; i++) {
size_t neg_idx = i * neg_sample_size; size_t neg_idx = i * neg_sample_size;
...@@ -1053,11 +1087,13 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po ...@@ -1053,11 +1087,13 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po
dgl_id_t global_vid = *it; dgl_id_t global_vid = *it;
exclude.push_back(global_vid); exclude.push_back(global_vid);
} }
neg_vids.clear(); prev_neg_offset = neg_vids.size();
RandomSample(num_tot_nodes, neg_sample_size, exclude, &neg_vids); RandomSample(num_tot_nodes, neg_sample_size, exclude, &neg_vids);
assert(prev_neg_offset + neg_sample_size == neg_vids.size());
} else if (neg_sample_size < num_tot_nodes) { } else if (neg_sample_size < num_tot_nodes) {
neg_vids.clear(); prev_neg_offset = neg_vids.size();
RandomSample(num_tot_nodes, neg_sample_size, &neg_vids); RandomSample(num_tot_nodes, neg_sample_size, &neg_vids);
assert(prev_neg_offset + neg_sample_size == neg_vids.size());
} else if (exclude_positive) { } else if (exclude_positive) {
LOG(FATAL) << "We can't exclude positive edges when sampling negative edges with all nodes."; LOG(FATAL) << "We can't exclude positive edges when sampling negative edges with all nodes.";
} else { } else {
...@@ -1073,7 +1109,7 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po ...@@ -1073,7 +1109,7 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po
for (int64_t j = 0; j < neg_sample_size; j++) { for (int64_t j = 0; j < neg_sample_size; j++) {
neg_unchanged[neg_idx + j] = local_unchanged; neg_unchanged[neg_idx + j] = local_unchanged;
neg_eid_data[neg_idx + j] = curr_eid++; neg_eid_data[neg_idx + j] = curr_eid++;
dgl_id_t local_changed = global2local_map(neg_vids[j], &neg_map); dgl_id_t local_changed = global2local_map(neg_vids[j + prev_neg_offset], &neg_map);
neg_changed[neg_idx + j] = local_changed; neg_changed[neg_idx + j] = local_changed;
// induced negative eid references to the positive one. // induced negative eid references to the positive one.
induced_neg_eid_data[neg_idx + j] = induced_eid_data[i]; induced_neg_eid_data[neg_idx + j] = induced_eid_data[i];
...@@ -1095,6 +1131,20 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po ...@@ -1095,6 +1131,20 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po
neg_subg.graph = GraphPtr(new ImmutableGraph(neg_coo)); neg_subg.graph = GraphPtr(new ImmutableGraph(neg_coo));
neg_subg.induced_vertices = induced_neg_vid; neg_subg.induced_vertices = induced_neg_vid;
neg_subg.induced_edges = induced_neg_eid; neg_subg.induced_edges = induced_neg_eid;
// If we didn't sample all nodes to form negative edges, some of the nodes
// in the vector might be redundant.
if (neg_sample_size < num_tot_nodes) {
std::sort(neg_vids.begin(), neg_vids.end());
auto it = std::unique(neg_vids.begin(), neg_vids.end());
neg_vids.resize(it - neg_vids.begin());
}
if (IsNegativeHeadMode(neg_mode)) {
neg_subg.head_nid = aten::VecToIdArray(Global2Local(neg_vids, neg_map));
neg_subg.tail_nid = aten::VecToIdArray(local_pos_vids);
} else {
neg_subg.head_nid = aten::VecToIdArray(local_pos_vids);
neg_subg.tail_nid = aten::VecToIdArray(Global2Local(neg_vids, neg_map));
}
// TODO(zhengda) we should provide an array of 1s if exclude_positive // TODO(zhengda) we should provide an array of 1s if exclude_positive
if (check_false_neg) { if (check_false_neg) {
if (relations->shape[0] == 0) { if (relations->shape[0] == 0) {
...@@ -1175,6 +1225,19 @@ NegSubgraph PBGNegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph ...@@ -1175,6 +1225,19 @@ NegSubgraph PBGNegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph
dgl_id_t curr_eid = 0; dgl_id_t curr_eid = 0;
std::unordered_map<dgl_id_t, dgl_id_t> neg_map; std::unordered_map<dgl_id_t, dgl_id_t> neg_map;
dgl_id_t local_vid = 0;
// Collect nodes in the positive side.
std::vector<dgl_id_t> local_pos_vids;
local_pos_vids.reserve(num_pos_edges);
for (int64_t i = 0; i < num_pos_edges; i++) {
dgl_id_t vid = induced_vid_data[unchanged[i]];
auto it = neg_map.find(vid);
if (it == neg_map.end()) {
local_pos_vids.push_back(local_vid);
neg_map.insert(std::pair<dgl_id_t, dgl_id_t>(vid, local_vid++));
}
}
for (int64_t i_chunk = 0; i_chunk < num_chunks; i_chunk++) { for (int64_t i_chunk = 0; i_chunk < num_chunks; i_chunk++) {
// for each chunk. // for each chunk.
int64_t neg_idx = neg_sample_size * chunk_size * i_chunk; int64_t neg_idx = neg_sample_size * chunk_size * i_chunk;
...@@ -1221,6 +1284,13 @@ NegSubgraph PBGNegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph ...@@ -1221,6 +1284,13 @@ NegSubgraph PBGNegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph
neg_subg.graph = GraphPtr(new ImmutableGraph(neg_coo)); neg_subg.graph = GraphPtr(new ImmutableGraph(neg_coo));
neg_subg.induced_vertices = induced_neg_vid; neg_subg.induced_vertices = induced_neg_vid;
neg_subg.induced_edges = induced_neg_eid; neg_subg.induced_edges = induced_neg_eid;
if (IsNegativeHeadMode(neg_mode)) {
neg_subg.head_nid = aten::VecToIdArray(Global2Local(neg_vids, neg_map));
neg_subg.tail_nid = aten::VecToIdArray(local_pos_vids);
} else {
neg_subg.head_nid = aten::VecToIdArray(local_pos_vids);
neg_subg.tail_nid = aten::VecToIdArray(Global2Local(neg_vids, neg_map));
}
if (check_false_neg) { if (check_false_neg) {
if (relations->shape[0] == 0) { if (relations->shape[0] == 0) {
neg_subg.exist = CheckExistence(gptr, neg_src, neg_dst, induced_neg_vid); neg_subg.exist = CheckExistence(gptr, neg_src, neg_dst, induced_neg_vid);
...@@ -1311,4 +1381,18 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_GetNegEdgeExistence") ...@@ -1311,4 +1381,18 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_GetNegEdgeExistence")
*rv = gptr->exist; *rv = gptr->exist;
}); });
// Returns the head_nid array stored in the NegSubgraph passed as args[0].
// NOTE(review): the result of the downcast is not checked; this presumably
// relies on callers only passing negative subgraphs -- confirm.
DGL_REGISTER_GLOBAL("sampling._CAPI_GetEdgeSubgraphHead")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    SubgraphRef subg_ref = args[0];
    auto neg_subg = std::dynamic_pointer_cast<NegSubgraph>(subg_ref.sptr());
    *rv = neg_subg->head_nid;
  });
// Returns the tail_nid array stored in the NegSubgraph passed as args[0].
// NOTE(review): the result of the downcast is not checked; this presumably
// relies on callers only passing negative subgraphs -- confirm.
DGL_REGISTER_GLOBAL("sampling._CAPI_GetEdgeSubgraphTail")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    SubgraphRef subg_ref = args[0];
    auto neg_subg = std::dynamic_pointer_cast<NegSubgraph>(subg_ref.sptr());
    *rv = neg_subg->tail_nid;
  });
} // namespace dgl } // namespace dgl
...@@ -53,8 +53,9 @@ def verify_subgraph(g, subg, seed_id): ...@@ -53,8 +53,9 @@ def verify_subgraph(g, subg, seed_id):
assert(is_sorted(child_src)) assert(is_sorted(child_src))
# a neighbor in the subgraph must also exist in parent graph. # a neighbor in the subgraph must also exist in parent graph.
src = F.asnumpy(src)
for i in subg.map_to_parent_nid(child_src): for i in subg.map_to_parent_nid(child_src):
assert i in src assert F.asnumpy(i) in src
def test_1neighbor_sampler(): def test_1neighbor_sampler():
g = generate_rand_graph(100) g = generate_rand_graph(100)
...@@ -186,7 +187,7 @@ def test_nonuniform_neighbor_sampler(): ...@@ -186,7 +187,7 @@ def test_nonuniform_neighbor_sampler():
assert nf.num_layers == 100 assert nf.num_layers == 100
for i in range(nf.num_layers): for i in range(nf.num_layers):
assert nf.layer_size(i) == 1 assert nf.layer_size(i) == 1
assert nf.layer_parent_nid(i)[0] == i assert F.asnumpy(nf.layer_parent_nid(i)[0]) == i
# Test the reverse direction # Test the reverse direction
sampler = dgl.contrib.sampling.NeighborSampler( sampler = dgl.contrib.sampling.NeighborSampler(
...@@ -196,7 +197,7 @@ def test_nonuniform_neighbor_sampler(): ...@@ -196,7 +197,7 @@ def test_nonuniform_neighbor_sampler():
assert nf.num_layers == 100 assert nf.num_layers == 100
for i in range(nf.num_layers): for i in range(nf.num_layers):
assert nf.layer_size(i) == 1 assert nf.layer_size(i) == 1
assert nf.layer_parent_nid(i)[0] == 99 - i assert F.asnumpy(nf.layer_parent_nid(i)[0]) == 99 - i
def test_setseed(): def test_setseed():
g = generate_rand_graph(100) g = generate_rand_graph(100)
...@@ -220,6 +221,19 @@ def test_setseed(): ...@@ -220,6 +221,19 @@ def test_setseed():
g, 5, 3, num_hops=2, neighbor_type='in', num_workers=4)): g, 5, 3, num_hops=2, neighbor_type='in', num_workers=4)):
pass pass
def check_head_tail(g):
    '''Check head_nid / tail_nid of an edge subgraph against its edges.

    The unique source nodes of all edges of ``g`` must equal ``g.head_nid``
    and the unique destination nodes must equal ``g.tail_nid``. Both
    properties must also be free of duplicates.
    '''
    # The edge Ids are requested along with the end points but not needed.
    src, dst, _eid = g.all_edges(form='all', order='eid')

    expected_heads = np.unique(F.asnumpy(src))
    heads = np.unique(F.asnumpy(g.head_nid))
    # np.unique would have shortened the array if it contained duplicates.
    assert len(heads) == len(g.head_nid)
    np.testing.assert_equal(expected_heads, heads)

    expected_tails = np.unique(F.asnumpy(dst))
    tails = np.unique(F.asnumpy(g.tail_nid))
    assert len(tails) == len(g.tail_nid)
    np.testing.assert_equal(tails, expected_tails)
def check_negative_sampler(mode, exclude_positive, neg_size): def check_negative_sampler(mode, exclude_positive, neg_size):
g = generate_rand_graph(100) g = generate_rand_graph(100)
etype = np.random.randint(0, 10, size=g.number_of_edges(), dtype=np.int64) etype = np.random.randint(0, 10, size=g.number_of_edges(), dtype=np.int64)
...@@ -245,6 +259,7 @@ def check_negative_sampler(mode, exclude_positive, neg_size): ...@@ -245,6 +259,7 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
pos_edges.parent_nid[pos_ldst]))) pos_edges.parent_nid[pos_ldst])))
neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid') neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
neg_src = neg_edges.parent_nid[neg_lsrc] neg_src = neg_edges.parent_nid[neg_lsrc]
neg_dst = neg_edges.parent_nid[neg_ldst] neg_dst = neg_edges.parent_nid[neg_ldst]
neg_eid = neg_edges.parent_eid[neg_leid] neg_eid = neg_edges.parent_eid[neg_leid]
...@@ -255,6 +270,13 @@ def check_negative_sampler(mode, exclude_positive, neg_size): ...@@ -255,6 +270,13 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
if exclude_positive: if exclude_positive:
assert int(F.asnumpy(neg_src[i])) != pos_map[(neg_d, neg_e)] assert int(F.asnumpy(neg_src[i])) != pos_map[(neg_d, neg_e)]
check_head_tail(neg_edges)
pos_tails = pos_edges.parent_nid[pos_edges.tail_nid]
neg_tails = neg_edges.parent_nid[neg_edges.tail_nid]
pos_tails = np.sort(F.asnumpy(pos_tails))
neg_tails = np.sort(F.asnumpy(neg_tails))
np.testing.assert_equal(pos_tails, neg_tails)
exist = neg_edges.edata['false_neg'] exist = neg_edges.edata['false_neg']
if exclude_positive: if exclude_positive:
assert np.sum(F.asnumpy(exist) == 0) == len(exist) assert np.sum(F.asnumpy(exist) == 0) == len(exist)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment