"vscode:/vscode.git/clone" did not exist on "62c827c81c0206cf37ef00ef23560be9ea95b29b"
Commit 4297e6d6 (unverified), authored by Da Zheng, committed by GitHub
Browse files

[Feature] add PBG's negative edge sampling. (#836)

* PBG negative edge sampler.

* add a positive edge to make it regular, handle last batch.

* exclude all positive edges in the parent graph.

* just uniformly sample negative nodes.

* fix lint.

* shuffle one-side nodes of positive edges.

* just uniformly sample negative nodes.

* change the data type.

* address comment.

* remove commented code.
parent 77822769
......@@ -903,10 +903,15 @@ dgl_id_t global2local_map(dgl_id_t global_id,
}
}
Subgraph NegEdgeSubgraph(int64_t num_tot_nodes, const Subgraph &pos_subg,
// Reports whether `mode` names head corruption, i.e. is exactly the
// case-sensitive string "head". Any other value yields false.
inline bool is_neg_head_mode(const std::string &mode) {
  return mode.compare("head") == 0;
}
Subgraph NegEdgeSubgraph(GraphPtr gptr, const Subgraph &pos_subg,
const std::string &neg_mode,
int neg_sample_size, bool is_multigraph,
bool exclude_positive) {
int neg_sample_size, bool exclude_positive) {
int64_t num_tot_nodes = gptr->NumVertices();
bool is_multigraph = gptr->IsMultigraph();
std::vector<IdArray> adj = pos_subg.graph->GetAdj(false, "coo");
IdArray coo = adj[0];
int64_t num_pos_edges = coo->shape[0] / 2;
......@@ -929,7 +934,6 @@ Subgraph NegEdgeSubgraph(int64_t num_tot_nodes, const Subgraph &pos_subg,
dgl_id_t *neg_eid_data = static_cast<dgl_id_t *>(neg_eid->data);
dgl_id_t *induced_neg_eid_data = static_cast<dgl_id_t *>(induced_neg_eid->data);
bool neg_head = (neg_mode == "head");
dgl_id_t curr_eid = 0;
std::vector<size_t> neg_vids;
neg_vids.reserve(neg_sample_size);
......@@ -943,23 +947,23 @@ Subgraph NegEdgeSubgraph(int64_t num_tot_nodes, const Subgraph &pos_subg,
const dgl_id_t *unchanged;
dgl_id_t *neg_unchanged;
dgl_id_t *neg_changed;
if (neg_head) {
if (is_neg_head_mode(neg_mode)) {
unchanged = dst_data;
neg_unchanged = neg_dst_data;
neg_changed = neg_src_data;
neigh_it = pos_subg.graph->PredVec(unchanged[i]);
neigh_it = gptr->PredVec(induced_vid_data[unchanged[i]]);
} else {
unchanged = src_data;
neg_unchanged = neg_src_data;
neg_changed = neg_dst_data;
neigh_it = pos_subg.graph->SuccVec(unchanged[i]);
neigh_it = gptr->SuccVec(induced_vid_data[unchanged[i]]);
}
if (exclude_positive) {
std::vector<size_t> exclude;
for (auto it = neigh_it.begin(); it != neigh_it.end(); it++) {
dgl_id_t local_vid = *it;
exclude.push_back(induced_vid_data[local_vid]);
dgl_id_t global_vid = *it;
exclude.push_back(global_vid);
}
RandomSample(num_tot_nodes, neg_sample_size, exclude, &neg_vids);
} else {
......@@ -997,6 +1001,122 @@ Subgraph NegEdgeSubgraph(int64_t num_tot_nodes, const Subgraph &pos_subg,
return neg_subg;
}
// Builds a negative-edge subgraph with PBG-style chunked negative sampling.
//
// The positive edges of `pos_subg` are split into consecutive chunks of
// `neg_sample_size` edges (the last chunk may be shorter). One group of
// `neg_sample_size` node ids is drawn per chunk, and every positive edge in
// the chunk is paired with each node of that group, so all edges of a chunk
// share the same corrupted endpoints.
//
// Parameters:
//   num_tot_nodes    - first argument handed to RandomSample when drawing the
//                      negative node ids (the caller in this file passes the
//                      parent graph's vertex count).
//   pos_subg         - subgraph holding the positive edges; its
//                      induced_vertices / induced_edges are used to translate
//                      local ids back to parent ids.
//   neg_mode         - "head" replaces the source endpoint of each edge; any
//                      other value replaces the destination endpoint.
//   neg_sample_size  - number of negative edges generated per positive edge.
//                      It is also the chunk size and is used as a divisor, so
//                      it must be non-zero.
//   is_multigraph    - forwarded unchanged to the COO constructor.
//   exclude_positive - accepted for signature parity but never consulted by
//                      this function.
//
// Returns a Subgraph whose graph is built from the generated (src, dst)
// pairs, whose induced_vertices maps each local node id to the id it was
// created from, and whose induced_edges stores, for every negative edge, the
// pos_subg.induced_edges entry of the positive edge it was derived from.
Subgraph PBGNegEdgeSubgraph(int64_t num_tot_nodes, const Subgraph &pos_subg,
                            const std::string &neg_mode,
                            int neg_sample_size, bool is_multigraph,
                            bool exclude_positive) {
  std::vector<IdArray> adj = pos_subg.graph->GetAdj(false, "coo");
  IdArray coo = adj[0];
  const int64_t num_pos_edges = coo->shape[0] / 2;

  const int64_t chunk_size = neg_sample_size;
  // If num_pos_edges isn't divisible by chunk_size, the actual number of
  // chunks is num_chunks + 1 and the last chunk holds last_chunk_size edges.
  // Otherwise the actual number of chunks is num_chunks and last_chunk_size
  // is 0.
  int64_t num_chunks = num_pos_edges / chunk_size;
  const int64_t last_chunk_size = num_pos_edges - num_chunks * chunk_size;

  // The number of negative edges: neg_sample_size per positive edge.
  const int64_t num_neg_edges = neg_sample_size * chunk_size * num_chunks;
  const int64_t num_neg_edges_last_chunk = neg_sample_size * last_chunk_size;
  const int64_t num_all_neg_edges = num_neg_edges + num_neg_edges_last_chunk;

  // We should include the last (partial) chunk.
  if (last_chunk_size > 0)
    num_chunks++;

  IdArray neg_dst = IdArray::Empty({num_all_neg_edges}, coo->dtype, coo->ctx);
  IdArray neg_src = IdArray::Empty({num_all_neg_edges}, coo->dtype, coo->ctx);
  // NOTE(review): neg_eid is filled below but is not attached to the returned
  // subgraph; confirm whether it is still needed.
  IdArray neg_eid = IdArray::Empty({num_all_neg_edges}, coo->dtype, coo->ctx);
  IdArray induced_neg_eid = IdArray::Empty({num_all_neg_edges}, coo->dtype, coo->ctx);

  // These are vids in the positive subgraph. The first half of the COO buffer
  // is read as destination ids and the second half as source ids.
  const dgl_id_t *dst_data = static_cast<const dgl_id_t *>(coo->data);
  const dgl_id_t *src_data = static_cast<const dgl_id_t *>(coo->data) + num_pos_edges;
  const dgl_id_t *induced_vid_data =
      static_cast<const dgl_id_t *>(pos_subg.induced_vertices->data);
  const dgl_id_t *induced_eid_data =
      static_cast<const dgl_id_t *>(pos_subg.induced_edges->data);

  dgl_id_t *neg_dst_data = static_cast<dgl_id_t *>(neg_dst->data);
  dgl_id_t *neg_src_data = static_cast<dgl_id_t *>(neg_src->data);
  dgl_id_t *neg_eid_data = static_cast<dgl_id_t *>(neg_eid->data);
  dgl_id_t *induced_neg_eid_data = static_cast<dgl_id_t *>(induced_neg_eid->data);

  // `unchanged` is the endpoint kept from the positive edge; `neg_unchanged`
  // and `neg_changed` are the output columns for the kept and the corrupted
  // endpoint respectively.
  const dgl_id_t *unchanged;
  dgl_id_t *neg_unchanged;
  dgl_id_t *neg_changed;
  if (is_neg_head_mode(neg_mode)) {
    // corrupt head nodes.
    unchanged = dst_data;
    neg_unchanged = neg_dst_data;
    neg_changed = neg_src_data;
  } else {
    // corrupt tail nodes.
    unchanged = src_data;
    neg_unchanged = neg_src_data;
    neg_changed = neg_dst_data;
  }

  // We first sample all negative nodes: one group of neg_sample_size ids per
  // chunk, consumed chunk by chunk below.
  std::vector<size_t> neg_vids;
  RandomSample(num_tot_nodes,
               num_chunks * neg_sample_size,
               &neg_vids);

  dgl_id_t curr_eid = 0;
  // Maps the ids passed to global2local_map to local ids of the negative
  // graph; keys become induced_vertices entries, values are the positions.
  std::unordered_map<dgl_id_t, dgl_id_t> neg_map;
  for (int64_t i_chunk = 0; i_chunk < num_chunks; i_chunk++) {
    // Write offset of this chunk in the negative-edge arrays.
    int64_t neg_idx = neg_sample_size * chunk_size * i_chunk;
    // Index of the chunk's first positive edge.
    const int64_t pos_edge_idx = chunk_size * i_chunk;
    // Offset of the chunk's group of sampled nodes inside neg_vids.
    const int64_t neg_node_idx = neg_sample_size * i_chunk;
    // The actual chunk size. It'll be different for the last chunk.
    const int64_t chunk_size1 =
        (i_chunk == num_chunks - 1 && last_chunk_size > 0) ? last_chunk_size
                                                           : chunk_size;

    for (int64_t in_chunk = 0; in_chunk != chunk_size1; ++in_chunk) {
      // For each positive edge in the chunk, translate its kept endpoint from
      // a pos_subg-local id to a parent id, then to a negative-graph id.
      const dgl_id_t global_unchanged =
          induced_vid_data[unchanged[pos_edge_idx + in_chunk]];
      const dgl_id_t local_unchanged = global2local_map(global_unchanged, &neg_map);

      for (int64_t j = 0; j < neg_sample_size; ++j) {
        neg_unchanged[neg_idx] = local_unchanged;
        neg_eid_data[neg_idx] = curr_eid++;
        const dgl_id_t global_changed_vid = neg_vids[neg_node_idx + j];
        // TODO(zhengda) we can avoid the hashtable lookup here.
        const dgl_id_t local_changed = global2local_map(global_changed_vid, &neg_map);
        neg_changed[neg_idx] = local_changed;
        // Remember which positive edge this negative edge was derived from.
        induced_neg_eid_data[neg_idx] = induced_eid_data[pos_edge_idx + in_chunk];
        neg_idx++;
      }
    }
  }

  // Now we know the number of vertices in the negative graph.
  const int64_t num_neg_nodes = neg_map.size();
  IdArray induced_neg_vid = IdArray::Empty({num_neg_nodes}, coo->dtype, coo->ctx);
  dgl_id_t *induced_neg_vid_data = static_cast<dgl_id_t *>(induced_neg_vid->data);
  // Invert the map: position = local id, value = originating id.
  for (const auto &global_and_local : neg_map) {
    induced_neg_vid_data[global_and_local.second] = global_and_local.first;
  }

  Subgraph neg_subg;
  // We sample negative vertices without replacement.
  // There shouldn't be duplicated edges.
  COOPtr neg_coo(new COO(num_neg_nodes, neg_src, neg_dst, is_multigraph));
  neg_subg.graph = GraphPtr(new ImmutableGraph(neg_coo));
  neg_subg.induced_vertices = induced_neg_vid;
  neg_subg.induced_edges = induced_neg_eid;
  return neg_subg;
}
// Copies `subg` into a freshly allocated, shared-owned Subgraph and wraps the
// resulting pointer in a SubgraphRef. std::make_shared is used instead of a
// naked `new` so the object and its control block come from one allocation.
inline SubgraphRef ConvertRef(const Subgraph &subg) {
  return SubgraphRef(std::make_shared<Subgraph>(subg));
}
......@@ -1042,11 +1162,17 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_UniformEdgeSampling")
Subgraph subg = gptr->EdgeSubgraph(worker_seeds, false);
positive_subgs[i] = ConvertRef(subg);
if (neg_mode.size() > 0) {
Subgraph neg_subg = NegEdgeSubgraph(gptr->NumVertices(), subg,
neg_mode, neg_sample_size,
// For PBG negative sampling, we accept "PBG-head" for corrupting head
// nodes and "PBG-tail" for corrupting tail nodes.
if (neg_mode.substr(0, 3) == "PBG") {
Subgraph neg_subg = PBGNegEdgeSubgraph(gptr->NumVertices(), subg,
neg_mode.substr(4), neg_sample_size,
gptr->IsMultigraph(), exclude_positive);
negative_subgs[i] = ConvertRef(neg_subg);
} else if (neg_mode.size() > 0) {
Subgraph neg_subg = NegEdgeSubgraph(gptr, subg, neg_mode, neg_sample_size,
exclude_positive);
negative_subgs[i] = ConvertRef(neg_subg);
}
}
if (neg_mode.size() > 0) {
......
......@@ -220,14 +220,22 @@ def test_setseed():
g, 5, 3, num_hops=2, neighbor_type='in', num_workers=4)):
pass
def test_negative_sampler():
def check_negative_sampler(mode, exclude_positive):
g = generate_rand_graph(100)
pos_gsrc, pos_gdst, pos_geid = g.all_edges(form='all', order='eid')
pos_map = {}
for i in range(len(pos_geid)):
pos_d = int(F.asnumpy(pos_gdst[i]))
pos_e = int(F.asnumpy(pos_geid[i]))
pos_map[(pos_d, pos_e)] = int(F.asnumpy(pos_gsrc[i]))
EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
neg_size = 10
for pos_edges, neg_edges in EdgeSampler(g, 50,
negative_mode="head",
neg_sample_size=10,
exclude_positive=True):
assert 10 * pos_edges.number_of_edges() == neg_edges.number_of_edges()
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=exclude_positive):
pos_nid = pos_edges.parent_nid
pos_eid = pos_edges.parent_eid
pos_lsrc, pos_ldst, pos_leid = pos_edges.all_edges(form='all', order='eid')
......@@ -236,12 +244,6 @@ def test_negative_sampler():
pos_eid = pos_eid[pos_leid]
assert_array_equal(F.asnumpy(pos_eid), F.asnumpy(g.edge_ids(pos_src, pos_dst)))
pos_map = {}
for i in range(len(pos_eid)):
pos_d = int(F.asnumpy(pos_dst[i]))
pos_e = int(F.asnumpy(pos_eid[i]))
pos_map[(pos_d, pos_e)] = int(F.asnumpy(pos_src[i]))
neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
neg_nid = neg_edges.parent_nid
neg_eid = neg_edges.parent_eid
......@@ -253,8 +255,13 @@ def test_negative_sampler():
neg_d = int(F.asnumpy(neg_dst[i]))
neg_e = int(F.asnumpy(neg_eid[i]))
assert (neg_d, neg_e) in pos_map
if exclude_positive:
assert int(F.asnumpy(neg_src[i])) != pos_map[(neg_d, neg_e)]
def test_negative_sampler():
    """Run the negative-sampler check for each covered configuration.

    Exercises the plain 'head' mode with positive-edge exclusion enabled,
    then the 'PBG-head' mode with exclusion disabled.
    """
    configurations = (
        ('head', True),
        ('PBG-head', False),
    )
    for mode, exclude_positive in configurations:
        check_negative_sampler(mode, exclude_positive)
if __name__ == '__main__':
test_create_full()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment