"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "e71f73d8df3c04350a3caa9ba5714bef6422fe87"
Unverified Commit 2e1cbd5d authored by Quan (Andy) Gan's avatar Quan (Andy) Gan Committed by GitHub
Browse files

[Feature] Random walk traces generation (#392)

* random walk traces generation

* remove outdated comments

* oops put in the wrong place

* explicit inline

* moving rand_r to util

* moving random walk to public function

* per-thread seed and openmp support

* type cast styles
parent f370e628
...@@ -8,6 +8,14 @@ ...@@ -8,6 +8,14 @@
#include "c_runtime_api.h" #include "c_runtime_api.h"
#ifdef _MSC_VER
// rand in MS compiler works well in multi-threading.
static inline int rand_r(unsigned *seed) {
return rand();
}
#define _CRT_RAND_S
#endif
namespace dgl { namespace dgl {
namespace runtime { namespace runtime {
......
...@@ -57,6 +57,19 @@ class SamplerOp { ...@@ -57,6 +57,19 @@ class SamplerOp {
static NodeFlow NeighborUniformSample(const ImmutableGraph *graph, IdArray seeds, static NodeFlow NeighborUniformSample(const ImmutableGraph *graph, IdArray seeds,
const std::string &edge_type, const std::string &edge_type,
int num_hops, int expand_factor); int num_hops, int expand_factor);
/*!
* \brief Batch-generate random walk traces
* \param seeds The array of starting vertex IDs
* \param num_traces The number of traces to generate for each seed
* \param num_hops The number of hops for each trace
* \return a flat ID array with shape (num_seeds, num_traces, num_hops + 1)
*/
// TODO: move to sampler.cc
static IdArray RandomWalk(const GraphInterface *gptr,
IdArray seeds,
int num_traces,
int num_hops);
}; };
} // namespace dgl } // namespace dgl
......
from .sampler import NeighborSampler from .sampler import NeighborSampler
from .randomwalk import *
from ... import utils
from ... import backend as F
__all__ = ['random_walk']
def random_walk(g, seeds, num_traces, num_hops):
"""Batch-generate random walk traces on given graph with the same length.
Parameters
----------
g : DGLGraph
The graph. Must be readonly.
seeds : Tensor
The node ID tensor from which the random walk traces starts.
num_traces : int
Number of traces to generate for each seed.
num_hops : int
Number of hops for each trace.
Returns
-------
traces : Tensor
A 3-dimensional node ID tensor with shape
(num_seeds, num_traces, num_hops + 1)
traces[i, j, 0] are always starting nodes (i.e. seed[i]).
"""
return g._graph.random_walk(utils.toindex(seeds), num_traces, num_hops)
...@@ -680,6 +680,20 @@ class GraphIndex(object): ...@@ -680,6 +680,20 @@ class GraphIndex(object):
utils.toindex(rst(num_subgs * 3 + i)), utils.toindex(rst(num_subgs * 3 + i)),
utils.toindex(rst(num_subgs * 4 + i))) for i in range(num_subgs)] utils.toindex(rst(num_subgs * 4 + i))) for i in range(num_subgs)]
def random_walk(self, seeds, num_traces, num_hops):
"""Random walk sampling.
Returns a user Tensor of random walk traces with shape
(num_seeds, num_traces, num_hops + 1)
"""
if len(seeds) == 0:
return utils.toindex([])
seeds = seeds.todgltensor()
traces = _CAPI_DGLGraphRandomWalk(self._handle, seeds, num_traces, num_hops)
return F.zerocopy_from_dlpack(traces.to_dlpack())
def to_networkx(self): def to_networkx(self):
"""Convert to networkx graph. """Convert to networkx graph.
......
...@@ -483,4 +483,15 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphGetAdj") ...@@ -483,4 +483,15 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphGetAdj")
*rv = ConvertAdjToPackedFunc(res); *rv = ConvertAdjToPackedFunc(res);
}); });
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphRandomWalk")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const IdArray seeds = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const int num_traces = args[2];
const int num_hops = args[3];
const GraphInterface *ptr = static_cast<const GraphInterface *>(ghandle);
*rv = SamplerOp::RandomWalk(ptr, seeds, num_traces, num_hops);
});
} // namespace dgl } // namespace dgl
...@@ -5,8 +5,6 @@ ...@@ -5,8 +5,6 @@
*/ */
#include <dgl/immutable_graph.h> #include <dgl/immutable_graph.h>
#include <cstdlib>
#include <cmath>
#ifdef _MSC_VER #ifdef _MSC_VER
#define _CRT_RAND_S #define _CRT_RAND_S
......
...@@ -7,6 +7,9 @@ ...@@ -7,6 +7,9 @@
#include <dgl/sampler.h> #include <dgl/sampler.h>
#include <dgl/immutable_graph.h> #include <dgl/immutable_graph.h>
#include <algorithm> #include <algorithm>
#include <cstdlib>
#include <cmath>
#include <omp.h>
#ifdef _MSC_VER #ifdef _MSC_VER
// rand in MS compiler works well in multi-threading. // rand in MS compiler works well in multi-threading.
...@@ -305,7 +308,7 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list, ...@@ -305,7 +308,7 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list,
} }
layer_off_data[0] = 0; layer_off_data[0] = 0;
layer_off_data[1] = layer_offsets[num_hops] - layer_offsets[num_hops - 1]; layer_off_data[1] = layer_offsets[num_hops] - layer_offsets[num_hops - 1];
size_t out_layer_idx = 1; int out_layer_idx = 1;
for (int layer_id = num_hops - 2; layer_id >= 0; layer_id--) { for (int layer_id = num_hops - 2; layer_id >= 0; layer_id--) {
std::sort(neigh_pos->begin() + layer_offsets[layer_id], std::sort(neigh_pos->begin() + layer_offsets[layer_id],
neigh_pos->begin() + layer_offsets[layer_id + 1], neigh_pos->begin() + layer_offsets[layer_id + 1],
...@@ -345,14 +348,14 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list, ...@@ -345,14 +348,14 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list,
// Copy flow offsets. // Copy flow offsets.
flow_off_data[0] = 0; flow_off_data[0] = 0;
size_t out_flow_idx = 0; int out_flow_idx = 0;
for (int i = 0; i < layer_offsets.size() - 2; i++) { for (size_t i = 0; i < layer_offsets.size() - 2; i++) {
size_t num_edges = subg_csr->GetDegree(layer_off_data[i + 1], layer_off_data[i + 2]); size_t num_edges = subg_csr->GetDegree(layer_off_data[i + 1], layer_off_data[i + 2]);
flow_off_data[out_flow_idx + 1] = flow_off_data[out_flow_idx] + num_edges; flow_off_data[out_flow_idx + 1] = flow_off_data[out_flow_idx] + num_edges;
out_flow_idx++; out_flow_idx++;
} }
CHECK(out_flow_idx == num_hops - 1); CHECK(out_flow_idx == num_hops - 1);
CHECK(flow_off_data[num_hops - 1] == num_edges); CHECK(flow_off_data[num_hops - 1] == static_cast<uint64_t>(num_edges));
for (size_t i = 0; i < subg_csr->edge_ids.size(); i++) { for (size_t i = 0; i < subg_csr->edge_ids.size(); i++) {
subg_csr->edge_ids[i] = i; subg_csr->edge_ids[i] = i;
...@@ -404,7 +407,7 @@ NodeFlow SampleSubgraph(const ImmutableGraph *graph, ...@@ -404,7 +407,7 @@ NodeFlow SampleSubgraph(const ImmutableGraph *graph,
layer_offsets[0] = 0; layer_offsets[0] = 0;
layer_offsets[1] = sub_vers.size(); layer_offsets[1] = sub_vers.size();
for (size_t layer_id = 1; layer_id < num_hops; layer_id++) { for (int layer_id = 1; layer_id < num_hops; layer_id++) {
// We need to avoid resampling the same node in a layer, but we allow a node // We need to avoid resampling the same node in a layer, but we allow a node
// to be resampled in multiple layers. We use `sub_ver_map` to keep track of // to be resampled in multiple layers. We use `sub_ver_map` to keep track of
// sampled nodes in a layer, and clear it when entering a new layer. // sampled nodes in a layer, and clear it when entering a new layer.
...@@ -479,4 +482,45 @@ NodeFlow SamplerOp::NeighborUniformSample(const ImmutableGraph *graph, IdArray s ...@@ -479,4 +482,45 @@ NodeFlow SamplerOp::NeighborUniformSample(const ImmutableGraph *graph, IdArray s
expand_factor); expand_factor);
} }
IdArray SamplerOp::RandomWalk(
const GraphInterface *gptr,
IdArray seeds,
int num_traces,
int num_hops) {
const int num_nodes = seeds->shape[0];
const dgl_id_t *seed_ids = static_cast<dgl_id_t *>(seeds->data);
IdArray traces = IdArray::Empty(
{num_nodes, num_traces, num_hops + 1},
DLDataType{kDLInt, 64, 1},
DLContext{kDLCPU, 0});
dgl_id_t *trace_data = static_cast<dgl_id_t *>(traces->data);
#pragma omp parallel
{
// get per-thread seed
unsigned int random_seed = time(nullptr) ^ omp_get_thread_num();
#pragma omp for
for (int i = 0; i < num_nodes; ++i) {
const dgl_id_t seed_id = seed_ids[i];
for (int j = 0; j < num_traces; ++j) {
dgl_id_t cur = seed_id;
const int kmax = num_hops + 1;
for (int k = 0; k < kmax; ++k) {
const size_t offset = ((size_t)i * num_traces + j) * kmax + k;
trace_data[offset] = cur;
const auto succ = gptr->SuccVec(cur);
const size_t size = succ.size();
cur = succ[rand_r(&random_seed) % size];
}
}
}
}
return traces;
}
} // namespace dgl } // namespace dgl
...@@ -99,8 +99,29 @@ def test_10neighbor_sampler(): ...@@ -99,8 +99,29 @@ def test_10neighbor_sampler():
check_10neighbor_sampler(g, seeds=np.unique(np.random.randint(0, g.number_of_nodes(), check_10neighbor_sampler(g, seeds=np.unique(np.random.randint(0, g.number_of_nodes(),
size=int(g.number_of_nodes() / 10)))) size=int(g.number_of_nodes() / 10))))
def test_random_walk():
edge_list = [(0, 1), (1, 2), (2, 3), (3, 4),
(4, 3), (3, 2), (2, 1), (1, 0)]
seeds = [0, 1]
n_traces = 3
n_hops = 4
g = dgl.DGLGraph(edge_list, readonly=True)
traces = dgl.contrib.sampling.random_walk(g, seeds, n_traces, n_hops)
traces = F.zerocopy_to_numpy(traces)
assert traces.shape == (len(seeds), n_traces, n_hops + 1)
for i, seed in enumerate(seeds):
assert (traces[i, :, 0] == seeds[i]).all()
trace_diff = np.diff(traces, axis=-1)
# only nodes with adjacent IDs are connected
assert (np.abs(trace_diff) == 1).all()
if __name__ == '__main__': if __name__ == '__main__':
test_1neighbor_sampler_all() test_1neighbor_sampler_all()
test_10neighbor_sampler_all() test_10neighbor_sampler_all()
test_1neighbor_sampler() test_1neighbor_sampler()
test_10neighbor_sampler() test_10neighbor_sampler()
test_random_walk()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment