Unverified Commit 3e454476 authored by Da Zheng's avatar Da Zheng Committed by GitHub
Browse files

edge sampler return head nodes and tail nodes. (#889)

* edge sampler return head nodes and tail nodes.

* edge subgraph.

* use VecToIdArray

* example.

* fix compile error.
parent 9c790b11
......@@ -385,6 +385,12 @@ struct Subgraph : public runtime::Object {
struct NegSubgraph: public Subgraph {
/*!
 * \brief The existence of the negative edges in the parent graph.
 *
 * The negative-edge builders fill this array only when checking for false
 * negatives is requested; it is exposed through _CAPI_GetNegEdgeExistence.
 */
IdArray exist;
/*!
 * \brief The Ids of head nodes
 *
 * The negative-edge builders store Ids that are local to this subgraph
 * (global Ids translated through the builder's global-to-local map).
 * Exposed through _CAPI_GetEdgeSubgraphHead.
 */
IdArray head_nid;
/*!
 * \brief The Ids of tail nodes
 *
 * Same Id space as head_nid. Whether the head side or the tail side holds
 * the sampled (corrupted) nodes depends on the negative sampling mode.
 * Exposed through _CAPI_GetEdgeSubgraphTail.
 */
IdArray tail_nid;
};
// Define SubgraphRef
......
......@@ -436,6 +436,45 @@ class LayerSampler(NodeFlowSampler):
nflows = [NodeFlow(self.g, obj) for obj in nfobjs]
return nflows
class EdgeSubgraph(subgraph.DGLSubGraph):
    '''Subgraph produced by an edge sampler.

    On top of the regular subgraph interface, the head nodes and the tail
    nodes of the sampled edges are available through the ``head_nid`` and
    ``tail_nid`` properties.

    Parameters
    ----------
    parent
        The parent graph; forwarded to the base class.
    sgi
        The subgraph index object; forwarded to the base class and kept so
        that the C API can be queried for head/tail nodes later.
    neg : bool
        True when this subgraph holds negative edges. In that case the head
        and tail nodes are fetched from the C API; otherwise they are derived
        from the edges of the subgraph.
    '''
    def __init__(self, parent, sgi, neg):
        super(EdgeSubgraph, self).__init__(parent, sgi)
        self.sgi = sgi
        self.neg = neg
        # Both are computed lazily by set_head_tail().
        self.head = None
        self.tail = None

    def set_head_tail(self):
        '''Compute and cache the head and tail node Ids.

        Nothing happens when both values have been computed already.
        '''
        if self.head is not None and self.tail is not None:
            return

        if not self.neg:
            # Positive subgraph: heads/tails are the distinct edge endpoints.
            src, dst = self.all_edges()
            self.head = F.unique(src)
            self.tail = F.unique(dst)
            return

        # Negative subgraph: the sampler already recorded both node sets.
        head_arr = _CAPI_GetEdgeSubgraphHead(self.sgi)
        self.head = utils.toindex(head_arr).tousertensor()
        tail_arr = _CAPI_GetEdgeSubgraphTail(self.sgi)
        self.tail = utils.toindex(tail_arr).tousertensor()

    @property
    def head_nid(self):
        '''Unique Ids of the head nodes of this subgraph.'''
        self.set_head_tail()
        return self.head

    @property
    def tail_nid(self):
        '''Unique Ids of the tail nodes of this subgraph.'''
        self.set_head_tail()
        return self.tail
class EdgeSampler(object):
'''Edge sampler for link prediction.
......@@ -465,6 +504,9 @@ class EdgeSampler(object):
'PBG-tail': the negative edges are generated by corrupting a set
of tail nodes with the same set of nodes similar to 'PBG-head'.
The sampler returns EdgeSubgraph, where a user can access the unique head nodes
and tail nodes directly.
When the flag return_false_neg is turned on, the sampler will also check
if the generated negative edges are true negative edges and will return
a vector that indicates false negative edges. The vector is stored in
......@@ -500,6 +542,13 @@ class EdgeSampler(object):
relations: tensor, optional
relations of the edges if this is a knowledge graph.
Examples
--------
>>> for pos_g, neg_g in EdgeSampler(g, batch_size=10):
>>> print(pos_g.head_nid, pos_g.tail_nid)
>>> print(neg_g.head_nid, neg_g.tail_nid)
>>> print(neg_g.edata['false_neg'])
Class properties
----------------
immutable_only : bool
......@@ -592,10 +641,10 @@ class EdgeSampler(object):
assert len(subgs) % 2 == 0
num_pos = int(len(subgs) / 2)
for i in range(num_pos):
pos_subg = subgraph.DGLSubGraph(self.g, subgs[i])
neg_subg = subgraph.DGLSubGraph(self.g, subgs[i + num_pos])
pos_subg = EdgeSubgraph(self.g, subgs[i], False)
neg_subg = EdgeSubgraph(self.g, subgs[i + num_pos], True)
if self._return_false_neg:
exist = _CAPI_GetNegEdgeExistence(subgs[i + num_pos]);
exist = _CAPI_GetNegEdgeExistence(subgs[i + num_pos])
neg_subg.edata['false_neg'] = utils.toindex(exist).tousertensor()
rets.append((pos_subg, neg_subg))
return rets
......
......@@ -107,7 +107,6 @@ class ArrayHeap {
* Uniformly sample integers from [0, set_size) without replacement.
*/
void RandomSample(size_t set_size, size_t num, std::vector<size_t>* out) {
out->clear();
if (num < set_size) {
std::unordered_set<size_t> sampled_idxs;
while (sampled_idxs.size() < num) {
......@@ -128,7 +127,6 @@ void RandomSample(size_t set_size, size_t num, const std::vector<size_t> &exclud
for (auto v : exclude) {
sampled_idxs.insert(std::pair<size_t, int>(v, 0));
}
out->clear();
if (num + exclude.size() < set_size) {
while (sampled_idxs.size() < num + exclude.size()) {
size_t rand = RandomEngine::ThreadLocal()->RandInt(set_size);
......@@ -979,6 +977,17 @@ IdArray CheckExistence(GraphPtr gptr, IdArray relations,
return exist;
}
std::vector<dgl_id_t> Global2Local(const std::vector<dgl_id_t> &ids,
const std::unordered_map<dgl_id_t, dgl_id_t> &map) {
std::vector<dgl_id_t> local_ids(ids.size());
for (size_t i = 0; i < ids.size(); i++) {
auto it = map.find(ids[i]);
assert(it != map.end());
local_ids[i] = it->second;
}
return local_ids;
}
NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &pos_subg,
const std::string &neg_mode,
int neg_sample_size, bool exclude_positive,
......@@ -1020,10 +1029,13 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po
neg_changed = neg_dst_data;
}
std::unordered_map<dgl_id_t, dgl_id_t> neg_map;
std::vector<dgl_id_t> local_pos_vids;
local_pos_vids.reserve(num_pos_edges);
dgl_id_t curr_eid = 0;
std::vector<size_t> neg_vids;
neg_vids.reserve(neg_sample_size);
std::unordered_map<dgl_id_t, dgl_id_t> neg_map;
// If we don't exclude positive edges, we are actually sampling more than
// the total number of nodes in the graph.
if (!exclude_positive && neg_sample_size >= num_tot_nodes) {
......@@ -1032,8 +1044,30 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po
neg_vids.push_back(i);
neg_map[i] = i;
}
// Get all nodes in the positive side.
for (int64_t i = 0; i < num_pos_edges; i++) {
dgl_id_t vid = induced_vid_data[unchanged[i]];
local_pos_vids.push_back(neg_map[vid]);
}
// There is no guarantee that the nodes in the vector are unique.
std::sort(local_pos_vids.begin(), local_pos_vids.end());
auto it = std::unique(local_pos_vids.begin(), local_pos_vids.end());
local_pos_vids.resize(it - local_pos_vids.begin());
} else {
// Collect nodes in the positive side.
dgl_id_t local_vid = 0;
for (int64_t i = 0; i < num_pos_edges; i++) {
dgl_id_t vid = induced_vid_data[unchanged[i]];
auto it = neg_map.find(vid);
if (it == neg_map.end()) {
local_pos_vids.push_back(local_vid);
neg_map.insert(std::pair<dgl_id_t, dgl_id_t>(vid, local_vid++));
}
}
}
int64_t prev_neg_offset = 0;
for (int64_t i = 0; i < num_pos_edges; i++) {
size_t neg_idx = i * neg_sample_size;
......@@ -1053,11 +1087,13 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po
dgl_id_t global_vid = *it;
exclude.push_back(global_vid);
}
neg_vids.clear();
prev_neg_offset = neg_vids.size();
RandomSample(num_tot_nodes, neg_sample_size, exclude, &neg_vids);
assert(prev_neg_offset + neg_sample_size == neg_vids.size());
} else if (neg_sample_size < num_tot_nodes) {
neg_vids.clear();
prev_neg_offset = neg_vids.size();
RandomSample(num_tot_nodes, neg_sample_size, &neg_vids);
assert(prev_neg_offset + neg_sample_size == neg_vids.size());
} else if (exclude_positive) {
LOG(FATAL) << "We can't exclude positive edges when sampling negative edges with all nodes.";
} else {
......@@ -1073,7 +1109,7 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po
for (int64_t j = 0; j < neg_sample_size; j++) {
neg_unchanged[neg_idx + j] = local_unchanged;
neg_eid_data[neg_idx + j] = curr_eid++;
dgl_id_t local_changed = global2local_map(neg_vids[j], &neg_map);
dgl_id_t local_changed = global2local_map(neg_vids[j + prev_neg_offset], &neg_map);
neg_changed[neg_idx + j] = local_changed;
// induced negative eid references to the positive one.
induced_neg_eid_data[neg_idx + j] = induced_eid_data[i];
......@@ -1095,6 +1131,20 @@ NegSubgraph NegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph &po
neg_subg.graph = GraphPtr(new ImmutableGraph(neg_coo));
neg_subg.induced_vertices = induced_neg_vid;
neg_subg.induced_edges = induced_neg_eid;
// If we didn't sample all nodes to form negative edges, some of the nodes
// in the vector might be redundant.
if (neg_sample_size < num_tot_nodes) {
std::sort(neg_vids.begin(), neg_vids.end());
auto it = std::unique(neg_vids.begin(), neg_vids.end());
neg_vids.resize(it - neg_vids.begin());
}
if (IsNegativeHeadMode(neg_mode)) {
neg_subg.head_nid = aten::VecToIdArray(Global2Local(neg_vids, neg_map));
neg_subg.tail_nid = aten::VecToIdArray(local_pos_vids);
} else {
neg_subg.head_nid = aten::VecToIdArray(local_pos_vids);
neg_subg.tail_nid = aten::VecToIdArray(Global2Local(neg_vids, neg_map));
}
// TODO(zhengda) we should provide an array of 1s if exclude_positive
if (check_false_neg) {
if (relations->shape[0] == 0) {
......@@ -1175,6 +1225,19 @@ NegSubgraph PBGNegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph
dgl_id_t curr_eid = 0;
std::unordered_map<dgl_id_t, dgl_id_t> neg_map;
dgl_id_t local_vid = 0;
// Collect nodes in the positive side.
std::vector<dgl_id_t> local_pos_vids;
local_pos_vids.reserve(num_pos_edges);
for (int64_t i = 0; i < num_pos_edges; i++) {
dgl_id_t vid = induced_vid_data[unchanged[i]];
auto it = neg_map.find(vid);
if (it == neg_map.end()) {
local_pos_vids.push_back(local_vid);
neg_map.insert(std::pair<dgl_id_t, dgl_id_t>(vid, local_vid++));
}
}
for (int64_t i_chunk = 0; i_chunk < num_chunks; i_chunk++) {
// for each chunk.
int64_t neg_idx = neg_sample_size * chunk_size * i_chunk;
......@@ -1221,6 +1284,13 @@ NegSubgraph PBGNegEdgeSubgraph(GraphPtr gptr, IdArray relations, const Subgraph
neg_subg.graph = GraphPtr(new ImmutableGraph(neg_coo));
neg_subg.induced_vertices = induced_neg_vid;
neg_subg.induced_edges = induced_neg_eid;
if (IsNegativeHeadMode(neg_mode)) {
neg_subg.head_nid = aten::VecToIdArray(Global2Local(neg_vids, neg_map));
neg_subg.tail_nid = aten::VecToIdArray(local_pos_vids);
} else {
neg_subg.head_nid = aten::VecToIdArray(local_pos_vids);
neg_subg.tail_nid = aten::VecToIdArray(Global2Local(neg_vids, neg_map));
}
if (check_false_neg) {
if (relations->shape[0] == 0) {
neg_subg.exist = CheckExistence(gptr, neg_src, neg_dst, induced_neg_vid);
......@@ -1311,4 +1381,18 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_GetNegEdgeExistence")
*rv = gptr->exist;
});
// Return the head node Ids stored in a negative edge subgraph.
// The argument is downcast to NegSubgraph; the result of the cast is not
// checked here, so callers must pass a subgraph created as a NegSubgraph.
DGL_REGISTER_GLOBAL("sampling._CAPI_GetEdgeSubgraphHead")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    SubgraphRef subg_ref = args[0];
    const auto neg_subg = std::dynamic_pointer_cast<NegSubgraph>(subg_ref.sptr());
    *rv = neg_subg->head_nid;
  });
// Return the tail node Ids stored in a negative edge subgraph.
// The argument is downcast to NegSubgraph; the result of the cast is not
// checked here, so callers must pass a subgraph created as a NegSubgraph.
DGL_REGISTER_GLOBAL("sampling._CAPI_GetEdgeSubgraphTail")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    SubgraphRef subg_ref = args[0];
    const auto neg_subg = std::dynamic_pointer_cast<NegSubgraph>(subg_ref.sptr());
    *rv = neg_subg->tail_nid;
  });
} // namespace dgl
......@@ -53,8 +53,9 @@ def verify_subgraph(g, subg, seed_id):
assert(is_sorted(child_src))
# a neighbor in the subgraph must also exist in parent graph.
src = F.asnumpy(src)
for i in subg.map_to_parent_nid(child_src):
assert i in src
assert F.asnumpy(i) in src
def test_1neighbor_sampler():
g = generate_rand_graph(100)
......@@ -186,7 +187,7 @@ def test_nonuniform_neighbor_sampler():
assert nf.num_layers == 100
for i in range(nf.num_layers):
assert nf.layer_size(i) == 1
assert nf.layer_parent_nid(i)[0] == i
assert F.asnumpy(nf.layer_parent_nid(i)[0]) == i
# Test the reverse direction
sampler = dgl.contrib.sampling.NeighborSampler(
......@@ -196,7 +197,7 @@ def test_nonuniform_neighbor_sampler():
assert nf.num_layers == 100
for i in range(nf.num_layers):
assert nf.layer_size(i) == 1
assert nf.layer_parent_nid(i)[0] == 99 - i
assert F.asnumpy(nf.layer_parent_nid(i)[0]) == 99 - i
def test_setseed():
g = generate_rand_graph(100)
......@@ -220,6 +221,19 @@ def test_setseed():
g, 5, 3, num_hops=2, neighbor_type='in', num_workers=4)):
pass
def check_head_tail(g):
    '''Verify that head_nid / tail_nid of an edge subgraph match its edges.

    The reported head (tail) nodes must contain no duplicates and must be
    exactly the set of source (destination) nodes of the subgraph's edges.
    '''
    edge_src, edge_dst, _ = g.all_edges(form='all', order='eid')

    # Heads: distinct sources of all edges.
    expected_heads = np.unique(F.asnumpy(edge_src))
    reported_heads = np.unique(F.asnumpy(g.head_nid))
    assert len(reported_heads) == len(g.head_nid)
    np.testing.assert_equal(expected_heads, reported_heads)

    # Tails: distinct destinations of all edges.
    expected_tails = np.unique(F.asnumpy(edge_dst))
    reported_tails = np.unique(F.asnumpy(g.tail_nid))
    assert len(reported_tails) == len(g.tail_nid)
    np.testing.assert_equal(reported_tails, expected_tails)
def check_negative_sampler(mode, exclude_positive, neg_size):
g = generate_rand_graph(100)
etype = np.random.randint(0, 10, size=g.number_of_edges(), dtype=np.int64)
......@@ -245,6 +259,7 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
pos_edges.parent_nid[pos_ldst])))
neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
neg_src = neg_edges.parent_nid[neg_lsrc]
neg_dst = neg_edges.parent_nid[neg_ldst]
neg_eid = neg_edges.parent_eid[neg_leid]
......@@ -255,6 +270,13 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
if exclude_positive:
assert int(F.asnumpy(neg_src[i])) != pos_map[(neg_d, neg_e)]
check_head_tail(neg_edges)
pos_tails = pos_edges.parent_nid[pos_edges.tail_nid]
neg_tails = neg_edges.parent_nid[neg_edges.tail_nid]
pos_tails = np.sort(F.asnumpy(pos_tails))
neg_tails = np.sort(F.asnumpy(neg_tails))
np.testing.assert_equal(pos_tails, neg_tails)
exist = neg_edges.edata['false_neg']
if exclude_positive:
assert np.sum(F.asnumpy(exist) == 0) == len(exist)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment