Unverified Commit a9dabcc7 authored by Quan (Andy) Gan's avatar Quan (Andy) Gan Committed by GitHub
Browse files

[Feature] Random Walk for 0.5 (#1209)

* trying to refactor IndexSelect

* partial implementation

* add index select and assign for floats as well

* move to random choice source

* more updates

* fixes

* fixes

* more fixes

* adding python impl

* fixes

* unit test

* lint

* lint x2

* lint x3

* update metapath2vec

* debugging performance

* still debugging for performance

* tuning

* switching to succvec

* redo

* revert non-uniform sampler to use vector

* still not fast

* why does this crash with OpenMP???

* because there was a data race!!!

* add documentations and remove assign op

* lint

* lint x2

* lol what have i done

* lint x3

* fix and disable gpu testing

* bugfix

* generic random walk

* reorg the random walk source code

* Update randomwalks.h

* Update randomwalks_cpu.cc

* rename file

* move internal function to anonymous ns

* reorg & docstrings

* constant restart probability

* docstring fix

* more commit

* random walk with restart, tested

* some fixes

* switch to using NDArray for choice

* massive fix & docstring

* lint x?

* lint x??

* fix

* export symbols

* skip gpu test

* addresses comments

* replaces another VecToIdArray

* add randomwalks.h to include

* replace void * with template
parent 5967d817
......@@ -4,6 +4,7 @@
* \brief DGL sampler implementation
*/
#include <dgl/sampler.h>
#include <dgl/array.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/container.h>
#include <dgl/packed_func_ext.h>
......@@ -14,7 +15,6 @@
#include <cmath>
#include <numeric>
#include "../c_api_common.h"
#include "../array/common.h" // for ATEN_FLOAT_TYPE_SWITCH
using namespace dgl::runtime;
......
## DGL Sampler
This directory contains the implementations for graph sampling routines in 0.5+.
### Code Hierarchy
#### Random walks:
* `randomwalks.h:`
* `randomwalks_cpu.h:GenericRandomWalk(hg, seeds, max_num_steps, step)`
* `metapath_randomwalk.h:RandomWalk(hg, seeds, metapath, prob, terminate)`
/*!
* Copyright (c) 2018 by Contributors
* \file graph/sampling/get_node_types_cpu.cc
* \brief DGL sampler - CPU implementation of random walks with OpenMP
*/
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <utility>
#include "randomwalks_impl.h"
namespace dgl {
using namespace dgl::runtime;
using namespace dgl::aten;
namespace sampling {
namespace impl {
template<DLDeviceType XPU, typename IdxType>
TypeArray GetNodeTypesFromMetapath(
const HeteroGraphPtr hg,
const TypeArray metapath) {
uint64_t num_etypes = metapath->shape[0];
TypeArray result = TypeArray::Empty(
{metapath->shape[0] + 1}, metapath->dtype, metapath->ctx);
const IdxType *metapath_data = static_cast<IdxType *>(metapath->data);
IdxType *result_data = static_cast<IdxType *>(result->data);
dgl_type_t curr_type = hg->GetEndpointTypes(metapath_data[0]).first;
result_data[0] = curr_type;
for (uint64_t i = 0; i < num_etypes; ++i) {
auto src_dst_type = hg->GetEndpointTypes(metapath_data[i]);
dgl_type_t srctype = src_dst_type.first;
dgl_type_t dsttype = src_dst_type.second;
if (srctype != curr_type) {
LOG(FATAL) << "source of edge type #" << i <<
" does not match destination of edge type #" << i - 1;
return result;
}
curr_type = dsttype;
result_data[i + 1] = dsttype;
}
return result;
}
template
TypeArray GetNodeTypesFromMetapath<kDLCPU, int32_t>(
const HeteroGraphPtr hg,
const TypeArray metapath);
template
TypeArray GetNodeTypesFromMetapath<kDLCPU, int64_t>(
const HeteroGraphPtr hg,
const TypeArray metapath);
}; // namespace impl
}; // namespace sampling
}; // namespace dgl
/*!
* Copyright (c) 2018 by Contributors
* \file graph/sampler/generic_randomwalk_cpu.h
* \brief DGL sampler - templated implementation definition of random walks on CPU
*/
#ifndef DGL_GRAPH_SAMPLING_METAPATH_RANDOMWALK_H_
#define DGL_GRAPH_SAMPLING_METAPATH_RANDOMWALK_H_
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/random.h>
#include <utility>
#include <vector>
#include "randomwalks_impl.h"
#include "randomwalks_cpu.h"
namespace dgl {
using namespace dgl::runtime;
using namespace dgl::aten;
namespace sampling {
namespace impl {
namespace {
// bool WhetherToTerminate(
// IdxType *node_ids_generated_so_far,
// dgl_id_t last_node_id_generated,
// int64_t number_of_nodes_generated_so_far)
template<typename IdxType>
using TerminatePredicate = std::function<bool(IdxType *, dgl_id_t, int64_t)>;
/*!
* \brief Select one successor of metapath-based random walk, given the path generated
* so far.
*
* \param data The path generated so far, of type \c IdxType.
* \param curr The last node ID generated.
* \param len The number of nodes generated so far. Note that the seed node is always
* included as \c data[0], and the successors start from \c data[1].
*
* \param edges_by_type Vector of results from \c GetAdj() by edge type.
* \param metapath_data Edge types of given metapath.
* \param prob Transition probability per edge type.
* \param terminate Predicate for terminating the current random walk path.
*
* \return A pair of ID of next successor (-1 if not exist), as well as whether to terminate.
*/
template<DLDeviceType XPU, typename IdxType>
std::pair<dgl_id_t, bool> MetapathRandomWalkStep(
IdxType *data,
dgl_id_t curr,
int64_t len,
const std::vector<std::vector<IdArray> > &edges_by_type,
const IdxType *metapath_data,
const std::vector<FloatArray> &prob,
TerminatePredicate<IdxType> terminate) {
dgl_type_t etype = metapath_data[len];
// Note that since the selection of successors is very lightweight (especially in the
// uniform case), we want to reduce the overheads (even from object copies or object
// construction) as much as possible.
// Using Successors() slows down by 2x.
// Using OutEdges() slows down by 10x.
const std::vector<NDArray> &csr_arrays = edges_by_type[etype];
const IdxType *offsets = static_cast<IdxType *>(csr_arrays[0]->data);
const IdxType *all_succ = static_cast<IdxType *>(csr_arrays[1]->data);
const IdxType *succ = all_succ + offsets[curr];
const int64_t size = offsets[curr + 1] - offsets[curr];
if (size == 0)
return std::make_pair(-1, true);
FloatArray prob_etype = prob[etype];
IdxType idx;
if (prob_etype->shape[0] == 0) {
// empty probability array; assume uniform
idx = RandomEngine::ThreadLocal()->RandInt(size);
} else {
// non-uniform random walk
const IdxType *all_eids = static_cast<IdxType *>(csr_arrays[2]->data);
const IdxType *eids = all_eids + offsets[curr];
ATEN_FLOAT_TYPE_SWITCH(prob_etype->dtype, DType, "probability", {
FloatArray prob_selected = FloatArray::Empty({size}, prob_etype->dtype, prob_etype->ctx);
DType *prob_selected_data = static_cast<DType *>(prob_selected->data);
const DType *prob_etype_data = static_cast<DType *>(prob_etype->data);
for (int64_t j = 0; j < size; ++j)
prob_selected_data[j] = prob_etype_data[eids[j]];
idx = RandomEngine::ThreadLocal()->Choice<IdxType>(prob_selected);
});
}
curr = succ[idx];
return std::make_pair(curr, terminate(data, curr, len));
}
/*!
* \brief Metapath-based random walk.
* \param hg The heterograph.
* \param seeds A 1D array of seed nodes, with the type the source type of the first
* edge type in the metapath.
* \param metapath A 1D array of edge types representing the metapath.
* \param prob A vector of 1D float arrays, indicating the transition probability of
* each edge by edge type. An empty float array assumes uniform transition.
* \param terminate Predicate for terminating a random walk path.
* \return A 2D array of shape (len(seeds), len(metapath) + 1) with node IDs.
*/
template<DLDeviceType XPU, typename IdxType>
IdArray MetapathBasedRandomWalk(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
TerminatePredicate<IdxType> terminate) {
int64_t max_num_steps = metapath->shape[0];
const IdxType *metapath_data = static_cast<IdxType *>(metapath->data);
// Prefetch all edges.
// This forces the heterograph to materialize all OutCSR's before the OpenMP loop;
// otherwise data races will happen.
// TODO(BarclayII): should we later on materialize COO/CSR/CSC anyway unless told otherwise?
std::vector<std::vector<IdArray> > edges_by_type;
for (dgl_type_t etype = 0; etype < hg->NumEdgeTypes(); ++etype)
edges_by_type.push_back(hg->GetAdj(etype, true, "csr"));
StepFunc<IdxType> step =
[&edges_by_type, metapath_data, &prob, terminate]
(IdxType *data, dgl_id_t curr, int64_t len) {
return MetapathRandomWalkStep<XPU, IdxType>(
data, curr, len, edges_by_type, metapath_data, prob, terminate);
};
return GenericRandomWalk<XPU, IdxType>(seeds, max_num_steps, step);
}
}; // namespace
}; // namespace impl
}; // namespace sampling
}; // namespace dgl
#endif // DGL_GRAPH_SAMPLING_METAPATH_RANDOMWALK_H_
/*!
* Copyright (c) 2018 by Contributors
* \file graph/sampling/randomwalk_cpu.cc
* \brief DGL sampler - CPU implementation of metapath-based random walk with OpenMP
*/
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include "randomwalks_impl.h"
#include "randomwalks_cpu.h"
#include "metapath_randomwalk.h"
namespace dgl {
using namespace dgl::runtime;
using namespace dgl::aten;
namespace sampling {
namespace impl {
template<DLDeviceType XPU, typename IdxType>
IdArray RandomWalk(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob) {
TerminatePredicate<IdxType> terminate =
[] (IdxType *data, dgl_id_t curr, int64_t len) {
return false;
};
return MetapathBasedRandomWalk<XPU, IdxType>(hg, seeds, metapath, prob, terminate);
}
template
IdArray RandomWalk<kDLCPU, int32_t>(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob);
template
IdArray RandomWalk<kDLCPU, int64_t>(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob);
}; // namespace impl
}; // namespace sampling
}; // namespace dgl
/*!
* Copyright (c) 2018 by Contributors
* \file graph/sampling/randomwalk_with_restart_cpu.cc
* \brief DGL sampler - CPU implementation of metapath-based random walk with restart with OpenMP
*/
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/random.h>
#include <utility>
#include <vector>
#include "randomwalks_impl.h"
#include "randomwalks_cpu.h"
#include "metapath_randomwalk.h"
namespace dgl {
using namespace dgl::runtime;
using namespace dgl::aten;
namespace sampling {
namespace impl {
template<DLDeviceType XPU, typename IdxType>
IdArray RandomWalkWithRestart(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
double restart_prob) {
TerminatePredicate<IdxType> terminate =
[restart_prob] (IdxType *data, dgl_id_t curr, int64_t len) {
return RandomEngine::ThreadLocal()->Uniform<double>() < restart_prob;
};
return MetapathBasedRandomWalk<XPU, IdxType>(hg, seeds, metapath, prob, terminate);
}
template
IdArray RandomWalkWithRestart<kDLCPU, int32_t>(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
double restart_prob);
template
IdArray RandomWalkWithRestart<kDLCPU, int64_t>(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
double restart_prob);
template<DLDeviceType XPU, typename IdxType>
IdArray RandomWalkWithStepwiseRestart(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
FloatArray restart_prob) {
IdArray result;
ATEN_FLOAT_TYPE_SWITCH(restart_prob->dtype, DType, "restart probability", {
DType *restart_prob_data = static_cast<DType *>(restart_prob->data);
TerminatePredicate<IdxType> terminate =
[restart_prob_data] (IdxType *data, dgl_id_t curr, int64_t len) {
return RandomEngine::ThreadLocal()->Uniform<DType>() < restart_prob_data[len];
};
result = MetapathBasedRandomWalk<XPU, IdxType>(hg, seeds, metapath, prob, terminate);
});
return result;
}
template
IdArray RandomWalkWithStepwiseRestart<kDLCPU, int32_t>(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
FloatArray restart_prob);
template
IdArray RandomWalkWithStepwiseRestart<kDLCPU, int64_t>(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
FloatArray restart_prob);
}; // namespace impl
}; // namespace sampling
}; // namespace dgl
/*!
* Copyright (c) 2018 by Contributors
* \file graph/sampler/randomwalks.cc
* \brief Dispatcher of different DGL random walks by device type
*/
#include <dgl/runtime/container.h>
#include <dgl/packed_func_ext.h>
#include <dgl/array.h>
#include <dgl/sampling/randomwalks.h>
#include <utility>
#include <tuple>
#include <vector>
#include "../../c_api_common.h"
#include "randomwalks_impl.h"
using namespace dgl::runtime;
using namespace dgl::aten;
namespace dgl {
namespace sampling {
namespace {
void CheckRandomWalkInputs(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob) {
CHECK_INT(seeds, "seeds");
CHECK_INT(metapath, "metapath");
CHECK_NDIM(seeds, 1, "seeds");
CHECK_NDIM(metapath, 1, "metapath");
for (uint64_t i = 0; i < prob.size(); ++i) {
FloatArray p = prob[i];
CHECK_FLOAT(p, "probability");
if (p.GetSize() == 0)
CHECK_NDIM(p, 1, "probability");
}
}
}; // namespace
std::pair<IdArray, TypeArray> RandomWalk(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob) {
CheckRandomWalkInputs(hg, seeds, metapath, prob);
TypeArray vtypes;
IdArray vids;
ATEN_XPU_SWITCH(hg->Context().device_type, XPU, {
ATEN_ID_TYPE_SWITCH(seeds->dtype, IdxType, {
vtypes = impl::GetNodeTypesFromMetapath<XPU, IdxType>(hg, metapath);
vids = impl::RandomWalk<XPU, IdxType>(hg, seeds, metapath, prob);
});
});
return std::make_pair(vids, vtypes);
}
std::pair<IdArray, TypeArray> RandomWalkWithRestart(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
double restart_prob) {
CheckRandomWalkInputs(hg, seeds, metapath, prob);
CHECK(restart_prob >= 0 && restart_prob < 1) << "restart probability must belong to [0, 1)";
TypeArray vtypes;
IdArray vids;
ATEN_XPU_SWITCH(hg->Context().device_type, XPU, {
ATEN_ID_TYPE_SWITCH(seeds->dtype, IdxType, {
vtypes = impl::GetNodeTypesFromMetapath<XPU, IdxType>(hg, metapath);
vids = impl::RandomWalkWithRestart<XPU, IdxType>(hg, seeds, metapath, prob, restart_prob);
});
});
return std::make_pair(vids, vtypes);
}
std::pair<IdArray, TypeArray> RandomWalkWithStepwiseRestart(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
FloatArray restart_prob) {
CheckRandomWalkInputs(hg, seeds, metapath, prob);
// TODO(BarclayII): check the elements of restart probability
TypeArray vtypes;
IdArray vids;
ATEN_XPU_SWITCH(hg->Context().device_type, XPU, {
ATEN_ID_TYPE_SWITCH(seeds->dtype, IdxType, {
vtypes = impl::GetNodeTypesFromMetapath<XPU, IdxType>(hg, metapath);
vids = impl::RandomWalkWithStepwiseRestart<XPU, IdxType>(
hg, seeds, metapath, prob, restart_prob);
});
});
return std::make_pair(vids, vtypes);
}
}; // namespace sampling
DGL_REGISTER_GLOBAL("sampling.randomwalks._CAPI_DGLSamplingRandomWalk")
.set_body([] (DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef hg = args[0];
IdArray seeds = args[1];
TypeArray metapath = args[2];
List<Value> prob = args[3];
std::vector<FloatArray> prob_vec;
prob_vec.reserve(prob.size());
for (Value val : prob)
prob_vec.push_back(val->data);
auto result = sampling::RandomWalk(hg.sptr(), seeds, metapath, prob_vec);
List<Value> ret;
ret.push_back(Value(MakeValue(result.first)));
ret.push_back(Value(MakeValue(result.second)));
*rv = ret;
});
DGL_REGISTER_GLOBAL("sampling.randomwalks._CAPI_DGLSamplingRandomWalkWithRestart")
.set_body([] (DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef hg = args[0];
IdArray seeds = args[1];
TypeArray metapath = args[2];
List<Value> prob = args[3];
double restart_prob = args[4];
std::vector<FloatArray> prob_vec;
prob_vec.reserve(prob.size());
for (Value val : prob)
prob_vec.push_back(val->data);
auto result = sampling::RandomWalkWithRestart(
hg.sptr(), seeds, metapath, prob_vec, restart_prob);
List<Value> ret;
ret.push_back(Value(MakeValue(result.first)));
ret.push_back(Value(MakeValue(result.second)));
*rv = ret;
});
DGL_REGISTER_GLOBAL("sampling.randomwalks._CAPI_DGLSamplingRandomWalkWithStepwiseRestart")
.set_body([] (DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef hg = args[0];
IdArray seeds = args[1];
TypeArray metapath = args[2];
List<Value> prob = args[3];
FloatArray restart_prob = args[4];
std::vector<FloatArray> prob_vec;
prob_vec.reserve(prob.size());
for (Value val : prob)
prob_vec.push_back(val->data);
auto result = sampling::RandomWalkWithStepwiseRestart(
hg.sptr(), seeds, metapath, prob_vec, restart_prob);
List<Value> ret;
ret.push_back(Value(MakeValue(result.first)));
ret.push_back(Value(MakeValue(result.second)));
*rv = ret;
});
DGL_REGISTER_GLOBAL("sampling.randomwalks._CAPI_DGLSamplingPackTraces")
.set_body([] (DGLArgs args, DGLRetValue *rv) {
IdArray vids = args[0];
TypeArray vtypes = args[1];
IdArray concat_vids, concat_vtypes, lengths, offsets;
std::tie(concat_vids, lengths, offsets) = Pack(vids, -1);
std::tie(concat_vtypes, std::ignore) = ConcatSlices(vtypes, lengths);
List<Value> ret;
ret.push_back(Value(MakeValue(concat_vids)));
ret.push_back(Value(MakeValue(concat_vtypes)));
ret.push_back(Value(MakeValue(lengths)));
ret.push_back(Value(MakeValue(offsets)));
*rv = ret;
});
}; // namespace dgl
/*!
* Copyright (c) 2018 by Contributors
* \file graph/sampler/generic_randomwalk_cpu.h
* \brief DGL sampler - templated implementation definition of random walks on CPU
*/
#ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_CPU_H_
#define DGL_GRAPH_SAMPLING_RANDOMWALKS_CPU_H_
#include <dgl/base_heterograph.h>
#include <dgl/array.h>
#include "randomwalks_impl.h"
namespace dgl {
using namespace dgl::runtime;
using namespace dgl::aten;
namespace sampling {
namespace impl {
namespace {
/*!
* \brief Generic Random Walk.
* \param seeds A 1D array of seed nodes, with the type the source type of the first
* edge type in the metapath.
* \param max_num_steps The maximum number of steps of a random walk path.
* \param step The random walk step function with type \c StepFunc.
* \return A 2D array of shape (len(seeds), max_num_steps + 1) with node IDs.
* \note The graph itself should be bounded in the closure of \c step.
*/
template<DLDeviceType XPU, typename IdxType>
IdArray GenericRandomWalk(
const IdArray seeds,
int64_t max_num_steps,
StepFunc<IdxType> step) {
int64_t num_seeds = seeds->shape[0];
int64_t trace_length = max_num_steps + 1;
IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, seeds->ctx);
const IdxType *seed_data = static_cast<IdxType *>(seeds->data);
IdxType *traces_data = static_cast<IdxType *>(traces->data);
#pragma omp parallel for
for (int64_t seed_id = 0; seed_id < num_seeds; ++seed_id) {
int64_t i;
dgl_id_t curr = seed_data[seed_id];
traces_data[seed_id * trace_length] = curr;
for (i = 0; i < max_num_steps; ++i) {
const auto &succ = step(traces_data + seed_id * max_num_steps, curr, i);
traces_data[seed_id * trace_length + i + 1] = curr = succ.first;
if (succ.second)
break;
}
for (; i < max_num_steps; ++i)
traces_data[seed_id * trace_length + i + 1] = -1;
}
return traces;
}
}; // namespace
}; // namespace impl
}; // namespace sampling
}; // namespace dgl
#endif // DGL_GRAPH_SAMPLING_RANDOMWALKS_CPU_H_
/*!
* Copyright (c) 2018 by Contributors
* \file graph/sampling/randomwalks_impl.h
* \brief DGL sampler - templated implementation definition of random walks
*/
#ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_IMPL_H_
#define DGL_GRAPH_SAMPLING_RANDOMWALKS_IMPL_H_
#include <dgl/base_heterograph.h>
#include <dgl/array.h>
#include <vector>
#include <utility>
#include <functional>
namespace dgl {
using namespace dgl::runtime;
using namespace dgl::aten;
namespace sampling {
namespace impl {
/*!
* \brief Random walk step function
*/
template<typename IdxType>
using StepFunc = std::function<
// ID terminate?
std::pair<dgl_id_t, bool>(
IdxType *, // node IDs generated so far
dgl_id_t, // last node ID
int64_t)>; // # of steps
/*!
* \brief Get the node types traversed by the metapath.
* \return A 1D array of shape (len(metapath) + 1,) with node type IDs.
*/
template<DLDeviceType XPU, typename IdxType>
TypeArray GetNodeTypesFromMetapath(
const HeteroGraphPtr hg,
const TypeArray metapath);
/*!
* \brief Metapath-based random walk.
* \param hg The heterograph.
* \param seeds A 1D array of seed nodes, with the type the source type of the first
* edge type in the metapath.
* \param metapath A 1D array of edge types representing the metapath.
* \param prob A vector of 1D float arrays, indicating the transition probability of
* each edge by edge type. An empty float array assumes uniform transition.
* \return A 2D array of shape (len(seeds), len(metapath) + 1) with node IDs. The
* paths that terminated early are padded with -1.
* \note This function should be called together with GetNodeTypesFromMetapath to
* determine the node type of each node in the random walk traces.
*/
template<DLDeviceType XPU, typename IdxType>
IdArray RandomWalk(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob);
/*!
* \brief Metapath-based random walk with restart probability.
* \param hg The heterograph.
* \param seeds A 1D array of seed nodes, with the type the source type of the first
* edge type in the metapath.
* \param metapath A 1D array of edge types representing the metapath.
* \param prob A vector of 1D float arrays, indicating the transition probability of
* each edge by edge type. An empty float array assumes uniform transition.
* \param restart_prob Restart probability
* \return A 2D array of shape (len(seeds), len(metapath) + 1) with node IDs. The
* paths that terminated early are padded with -1.
* \note This function should be called together with GetNodeTypesFromMetapath to
* determine the node type of each node in the random walk traces.
*/
template<DLDeviceType XPU, typename IdxType>
IdArray RandomWalkWithRestart(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
double restart_prob);
/*!
* \brief Metapath-based random walk with stepwise restart probability. Useful
* for PinSAGE-like models.
* \param hg The heterograph.
* \param seeds A 1D array of seed nodes, with the type the source type of the first
* edge type in the metapath.
* \param metapath A 1D array of edge types representing the metapath.
* \param prob A vector of 1D float arrays, indicating the transition probability of
* each edge by edge type. An empty float array assumes uniform transition.
* \param restart_prob Restart probability array which has the same number of elements
* as \c metapath, indicating the probability to terminate after transition.
* \return A 2D array of shape (len(seeds), len(metapath) + 1) with node IDs. The
* paths that terminated early are padded with -1.
* \note This function should be called together with GetNodeTypesFromMetapath to
* determine the node type of each node in the random walk traces.
*/
template<DLDeviceType XPU, typename IdxType>
IdArray RandomWalkWithStepwiseRestart(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
FloatArray restart_prob);
}; // namespace impl
}; // namespace sampling
}; // namespace dgl
#endif // DGL_GRAPH_SAMPLING_RANDOMWALKS_IMPL_H_
......@@ -187,8 +187,8 @@ class UnitGraph::COO : public BaseHeteroGraph {
std::pair<dgl_id_t, dgl_id_t> FindEdge(dgl_type_t etype, dgl_id_t eid) const override {
CHECK(eid < NumEdges(etype)) << "Invalid edge id: " << eid;
const auto src = aten::IndexSelect(adj_.row, eid);
const auto dst = aten::IndexSelect(adj_.col, eid);
const dgl_id_t src = aten::IndexSelect<int64_t>(adj_.row, eid);
const dgl_id_t dst = aten::IndexSelect<int64_t>(adj_.col, eid);
return std::pair<dgl_id_t, dgl_id_t>(src, dst);
}
......
/*!
* Copyright (c) 2019 by Contributors
* \file random/choice.cc
* \brief Non-uniform discrete sampling implementation
*/
#include <dgl/random.h>
#include <algorithm>
#include <utility>
#include <queue>
#include <cstdlib>
#include <cmath>
#include <numeric>
#include <limits>
#include <vector>
#include "sample_utils.h"
namespace dgl {
template<typename IdxType>
IdxType RandomEngine::Choice(FloatArray prob) {
IdxType ret;
ATEN_FLOAT_TYPE_SWITCH(prob->dtype, ValueType, "probability", {
utils::TreeSampler<IdxType, ValueType, true> sampler(this, prob);
ret = sampler.Draw();
});
return ret;
}
template int32_t RandomEngine::Choice<int32_t>(FloatArray);
template int64_t RandomEngine::Choice<int64_t>(FloatArray);
}; // namespace dgl
......@@ -3,9 +3,11 @@
* \file dgl/sample_utils.h
* \brief Sampling utilities
*/
#ifndef DGL_SAMPLE_UTILS_H_
#define DGL_SAMPLE_UTILS_H_
#ifndef DGL_RANDOM_CPU_SAMPLE_UTILS_H_
#define DGL_RANDOM_CPU_SAMPLE_UTILS_H_
#include <dgl/random.h>
#include <dgl/array.h>
#include <algorithm>
#include <utility>
#include <queue>
......@@ -14,7 +16,6 @@
#include <numeric>
#include <limits>
#include <vector>
#include "random.h"
namespace dgl {
namespace utils {
......@@ -49,7 +50,7 @@ class AliasSampler: public BaseSampler<Idx, DType, replace> {
DType accum, taken; // accumulated likelihood
std::vector<Idx> K; // alias table
std::vector<DType> U; // probability table
std::vector<DType> _prob; // category distribution
FloatArray _prob; // category distribution
std::vector<bool> used; // indicate availability, activated when replace=false;
std::vector<Idx> id_mapping; // index mapping, activated when replace=false;
......@@ -60,16 +61,18 @@ class AliasSampler: public BaseSampler<Idx, DType, replace> {
return id_mapping[x];
}
void Reconstruct(const std::vector<DType>& prob) { // Reconstruct alias table
void Reconstruct(FloatArray prob) { // Reconstruct alias table
const int64_t prob_size = prob->shape[0];
const DType *prob_data = static_cast<DType *>(prob->data);
N = 0;
accum = 0.;
taken = 0.;
if (!replace)
id_mapping.clear();
for (Idx i = 0; i < prob.size(); ++i)
for (Idx i = 0; i < prob_size; ++i)
if (!used[i]) {
N++;
accum += prob[i];
accum += prob_data[i];
if (!replace)
id_mapping.push_back(i);
}
......@@ -80,7 +83,7 @@ class AliasSampler: public BaseSampler<Idx, DType, replace> {
std::fill(U.begin(), U.end(), avg); // initialize U
std::queue<std::pair<Idx, DType> > under, over;
for (Idx i = 0; i < N; ++i) {
DType p = prob[Map(i)];
DType p = prob_data[Map(i)];
if (p > avg)
over.push(std::make_pair(i, p));
else
......@@ -103,21 +106,22 @@ class AliasSampler: public BaseSampler<Idx, DType, replace> {
}
public:
void ResetState(const std::vector<DType>& prob) {
used.resize(prob.size());
void ResetState(FloatArray prob) {
used.resize(prob->shape[0]);
if (!replace)
_prob = prob;
std::fill(used.begin(), used.end(), false);
Reconstruct(prob);
}
explicit AliasSampler(RandomEngine* re, const std::vector<DType>& prob): re(re) {
explicit AliasSampler(RandomEngine* re, FloatArray prob): re(re) {
ResetState(prob);
}
~AliasSampler() {}
Idx Draw() {
const DType *_prob_data = static_cast<DType *>(_prob->data);
DType avg = accum / N;
if (!replace) {
if (2 * taken >= accum)
......@@ -131,7 +135,7 @@ class AliasSampler: public BaseSampler<Idx, DType, replace> {
} else {
rst = Map(K[i]);
}
DType cap = _prob[rst];
DType cap = _prob_data[rst];
if (!used[rst]) {
used[rst] = true;
taken += cap;
......@@ -166,7 +170,7 @@ class CDFSampler: public BaseSampler<Idx, DType, replace> {
RandomEngine *re;
Idx N;
DType accum, taken;
std::vector<DType> _prob; // categorical distribution
FloatArray _prob; // categorical distribution
std::vector<DType> cdf; // cumulative distribution function
std::vector<bool> used; // indicate availability, activated when replace=false;
std::vector<Idx> id_mapping; // indicate index mapping, activated when replace=false;
......@@ -178,7 +182,9 @@ class CDFSampler: public BaseSampler<Idx, DType, replace> {
return id_mapping[x];
}
void Reconstruct(const std::vector<DType>& prob) { // Reconstruct CDF
void Reconstruct(FloatArray prob) { // Reconstruct CDF
int64_t prob_size = prob->shape[0];
const DType *prob_data = static_cast<DType *>(prob->data);
N = 0;
accum = 0.;
taken = 0.;
......@@ -186,10 +192,10 @@ class CDFSampler: public BaseSampler<Idx, DType, replace> {
id_mapping.clear();
cdf.clear();
cdf.push_back(0);
for (Idx i = 0; i < prob.size(); ++i)
for (Idx i = 0; i < prob_size; ++i)
if (!used[i]) {
N++;
accum += prob[i];
accum += prob_data[i];
if (!replace)
id_mapping.push_back(i);
cdf.push_back(accum);
......@@ -198,21 +204,22 @@ class CDFSampler: public BaseSampler<Idx, DType, replace> {
}
public:
void ResetState(const std::vector<DType>& prob) {
used.resize(prob.size());
void ResetState(FloatArray prob) {
used.resize(prob->shape[0]);
if (!replace)
_prob = prob;
std::fill(used.begin(), used.end(), false);
Reconstruct(prob);
}
explicit CDFSampler(RandomEngine *re, const std::vector<DType>& prob): re(re) {
explicit CDFSampler(RandomEngine *re, FloatArray prob): re(re) {
ResetState(prob);
}
~CDFSampler() {}
Idx Draw() {
const DType *_prob_data = static_cast<DType *>(_prob->data);
DType eps = std::numeric_limits<DType>::min();
if (!replace) {
if (2 * taken >= accum)
......@@ -220,7 +227,7 @@ class CDFSampler: public BaseSampler<Idx, DType, replace> {
while (true) {
DType p = std::max(re->Uniform<DType>(0., accum), eps);
Idx rst = Map(std::lower_bound(cdf.begin(), cdf.end(), p) - cdf.begin() - 1);
DType cap = _prob[rst];
DType cap = _prob_data[rst];
if (!used[rst]) {
used[rst] = true;
taken += cap;
......@@ -249,20 +256,23 @@ class TreeSampler: public BaseSampler<Idx, DType, replace> {
private:
RandomEngine *re;
std::vector<DType> weight; // accumulated likelihood of subtrees.
int64_t N, num_leafs;
int64_t N;
int64_t num_leafs;
public:
void ResetState(const std::vector<DType>& prob) {
void ResetState(FloatArray prob) {
int64_t prob_size = prob->shape[0];
const DType *prob_data = static_cast<DType *>(prob->data);
std::fill(weight.begin(), weight.end(), 0);
for (int i = 0; i < prob.size(); ++i)
weight[num_leafs + i] = prob[i];
for (int i = num_leafs - 1; i >= 1; --i)
for (int64_t i = 0; i < prob_size; ++i)
weight[num_leafs + i] = prob_data[i];
for (int64_t i = num_leafs - 1; i >= 1; --i)
weight[i] = weight[i * 2] + weight[i * 2 + 1];
}
explicit TreeSampler(RandomEngine *re, const std::vector<DType>& prob): re(re) {
explicit TreeSampler(RandomEngine *re, FloatArray prob): re(re) {
num_leafs = 1;
while (num_leafs < prob.size())
while (num_leafs < prob->shape[0])
num_leafs *= 2;
N = num_leafs * 2;
weight.resize(N);
......@@ -299,4 +309,4 @@ class TreeSampler: public BaseSampler<Idx, DType, replace> {
}; // namespace utils
}; // namespace dgl
#endif // DGL_SAMPLE_UTILS_H_
#endif // DGL_RANDOM_CPU_SAMPLE_UTILS_H_
......@@ -223,6 +223,46 @@ void NDArray::CopyFromTo(DLTensor* from,
from_size, from->ctx, to->ctx, from->dtype, stream);
}
template<typename T>
NDArray NDArray::FromVector(const std::vector<T>& vec, DLDataType dtype, DLContext ctx) {
int64_t size = static_cast<int64_t>(vec.size());
NDArray ret = NDArray::Empty({size}, dtype, DLContext{kDLCPU, 0});
DeviceAPI::Get(ctx)->CopyDataFromTo(
vec.data(),
0,
static_cast<T*>(ret->data),
0,
size * sizeof(T),
DLContext{kDLCPU, 0},
ctx,
dtype,
nullptr);
return ret;
}
// export specializations
template NDArray NDArray::FromVector(const std::vector<int32_t>&, DLDataType, DLContext);
template NDArray NDArray::FromVector(const std::vector<int64_t>&, DLDataType, DLContext);
template NDArray NDArray::FromVector(const std::vector<uint32_t>&, DLDataType, DLContext);
template NDArray NDArray::FromVector(const std::vector<uint64_t>&, DLDataType, DLContext);
template NDArray NDArray::FromVector(const std::vector<float>&, DLDataType, DLContext);
template NDArray NDArray::FromVector(const std::vector<double>&, DLDataType, DLContext);
// specializations of FromVector
#define GEN_FROMVECTOR_FOR(T, DTypeCode, DTypeBits) \
template<> \
NDArray NDArray::FromVector<T>(const std::vector<T> &vec, DLContext ctx) { \
return FromVector<T>(vec, DLDataType{DTypeCode, DTypeBits, 1}, ctx); \
}
GEN_FROMVECTOR_FOR(int32_t, kDLInt, 32);
GEN_FROMVECTOR_FOR(int64_t, kDLInt, 64);
// XXX(BarclayII) most DL frameworks do not support unsigned int and long arrays, so I'm just
// converting uints to signed NDArrays.
GEN_FROMVECTOR_FOR(uint32_t, kDLInt, 32);
GEN_FROMVECTOR_FOR(uint64_t, kDLInt, 64);
GEN_FROMVECTOR_FOR(float, kDLFloat, 32);
GEN_FROMVECTOR_FOR(double, kDLFloat, 64);
} // namespace runtime
} // namespace dgl
......
import dgl
import backend as F
import numpy as np
import unittest
def check_random_walk(g, metapath, traces, ntypes, prob=None):
traces = F.asnumpy(traces)
ntypes = F.asnumpy(ntypes)
for j in range(traces.shape[1] - 1):
assert ntypes[j] == g.get_ntype_id(g.to_canonical_etype(metapath[j])[0])
assert ntypes[j + 1] == g.get_ntype_id(g.to_canonical_etype(metapath[j])[2])
for i in range(traces.shape[0]):
for j in range(traces.shape[1] - 1):
assert g.has_edge_between(
traces[i, j], traces[i, j+1], etype=metapath[j])
if prob is not None and prob in g.edges[metapath[j]].data:
p = F.asnumpy(g.edges[metapath[j]].data['p'])
eids = g.edge_id(traces[i, j], traces[i, j+1], etype=metapath[j])
assert p[eids] != 0
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU random walk not implemented")
def test_random_walk():
g1 = dgl.heterograph({
('user', 'follow', 'user'): [(0, 1), (1, 2), (2, 0)]
})
g2 = dgl.heterograph({
('user', 'follow', 'user'): [(0, 1), (1, 2), (1, 3), (2, 0), (3, 0)]
})
g3 = dgl.heterograph({
('user', 'follow', 'user'): [(0, 1), (1, 2), (2, 0)],
('user', 'view', 'item'): [(0, 0), (1, 1), (2, 2)],
('item', 'viewed-by', 'user'): [(0, 0), (1, 1), (2, 2)]})
g4 = dgl.heterograph({
('user', 'follow', 'user'): [(0, 1), (1, 2), (1, 3), (2, 0), (3, 0)],
('user', 'view', 'item'): [(0, 0), (0, 1), (1, 1), (2, 2), (3, 2), (3, 1)],
('item', 'viewed-by', 'user'): [(0, 0), (1, 0), (1, 1), (2, 2), (2, 3), (1, 3)]})
g2.edata['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32)
g4.edges['follow'].data['p'] = F.tensor([3, 0, 3, 3, 3], dtype=F.float32)
g4.edges['viewed-by'].data['p'] = F.tensor([1, 1, 1, 1, 1, 1], dtype=F.float32)
traces, ntypes = dgl.sampling.random_walk(g1, [0, 1, 2, 0, 1, 2], length=4)
check_random_walk(g1, ['follow'] * 4, traces, ntypes)
traces, ntypes = dgl.sampling.random_walk(g1, [0, 1, 2, 0, 1, 2], length=4, restart_prob=0.)
check_random_walk(g1, ['follow'] * 4, traces, ntypes)
traces, ntypes = dgl.sampling.random_walk(
g1, [0, 1, 2, 0, 1, 2], length=4, restart_prob=F.zeros((4,), F.float32, F.cpu()))
check_random_walk(g1, ['follow'] * 4, traces, ntypes)
traces, ntypes = dgl.sampling.random_walk(
g1, [0, 1, 2, 0, 1, 2], length=5,
restart_prob=F.tensor([0, 0, 0, 0, 1], dtype=F.float32))
check_random_walk(
g1, ['follow'] * 4, F.slice_axis(traces, 1, 0, 5), F.slice_axis(ntypes, 0, 0, 5))
assert (F.asnumpy(traces)[:, 5] == -1).all()
traces, ntypes = dgl.sampling.random_walk(
g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4)
check_random_walk(g2, ['follow'] * 4, traces, ntypes)
traces, ntypes = dgl.sampling.random_walk(
g2, [0, 1, 2, 3, 0, 1, 2, 3], length=4, prob='p')
check_random_walk(g2, ['follow'] * 4, traces, ntypes, 'p')
metapath = ['follow', 'view', 'viewed-by'] * 2
traces, ntypes = dgl.sampling.random_walk(
g3, [0, 1, 2, 0, 1, 2], metapath=metapath)
check_random_walk(g3, metapath, traces, ntypes)
metapath = ['follow', 'view', 'viewed-by'] * 2
traces, ntypes = dgl.sampling.random_walk(
g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath)
check_random_walk(g4, metapath, traces, ntypes)
metapath = ['follow', 'view', 'viewed-by'] * 2
traces, ntypes = dgl.sampling.random_walk(
g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath, prob='p')
check_random_walk(g4, metapath, traces, ntypes, 'p')
traces, ntypes = dgl.sampling.random_walk(
g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath, prob='p', restart_prob=0.)
check_random_walk(g4, metapath, traces, ntypes, 'p')
traces, ntypes = dgl.sampling.random_walk(
g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath, prob='p',
restart_prob=F.zeros((6,), F.float32, F.cpu()))
check_random_walk(g4, metapath, traces, ntypes, 'p')
traces, ntypes = dgl.sampling.random_walk(
g4, [0, 1, 2, 3, 0, 1, 2, 3], metapath=metapath + ['follow'], prob='p',
restart_prob=F.tensor([0, 0, 0, 0, 0, 0, 1], F.float32))
check_random_walk(g4, metapath, traces[:, :7], ntypes[:7], 'p')
assert (F.asnumpy(traces[:, 7]) == -1).all()
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU pack traces not implemented")
def test_pack_traces():
traces, types = (np.array(
[[ 0, 1, -1, -1, -1, -1, -1],
[ 0, 1, 1, 3, 0, 0, 0]], dtype='int64'),
np.array([0, 0, 1, 0, 0, 1, 0], dtype='int64'))
traces = F.zerocopy_from_numpy(traces)
types = F.zerocopy_from_numpy(types)
result = dgl.sampling.pack_traces(traces, types)
assert F.array_equal(result[0], F.tensor([0, 1, 0, 1, 1, 3, 0, 0, 0], dtype=F.int64))
assert F.array_equal(result[1], F.tensor([0, 0, 0, 0, 1, 0, 0, 1, 0], dtype=F.int64))
assert F.array_equal(result[2], F.tensor([2, 7], dtype=F.int64))
assert F.array_equal(result[3], F.tensor([0, 2], dtype=F.int64))
if __name__ == '__main__':
test_random_walk()
test_pack_traces()
......@@ -144,7 +144,7 @@ TEST(ArrayTest, TestHStack) {
template <typename IDX>
void _TestIndexSelect() {
IdArray a = aten::Range(0, 100, sizeof(IDX)*8, CTX);
ASSERT_EQ(aten::IndexSelect(a, 50), 50);
ASSERT_EQ(aten::IndexSelect<int>(a, 50), 50);
IdArray b = aten::VecToIdArray(std::vector<IDX>({0, 20, 10}), sizeof(IDX)*8, CTX);
IdArray c = aten::IndexSelect(a, b);
ASSERT_TRUE(ArrayEQ<IDX>(b, c));
......
#include <gtest/gtest.h>
#include <dgl/sample_utils.h>
#include <vector>
#include <algorithm>
#include <iostream>
#include "./common.h"
#include "../../src/random/cpu/sample_utils.h"
using namespace dgl;
using namespace dgl::aten;
// TODO: adapt this to Random::Choice
template <typename Idx, typename DType>
void _TestWithReplacement(RandomEngine *re) {
Idx n_categories = 100;
Idx n_rolls = 1000000;
std::vector<DType> prob;
std::vector<DType> _prob;
DType accum = 0.;
for (Idx i = 0; i < n_categories; ++i) {
prob.push_back(re->Uniform<DType>());
accum += prob.back();
_prob.push_back(re->Uniform<DType>());
accum += _prob.back();
}
for (Idx i = 0; i < n_categories; ++i)
prob[i] /= accum;
_prob[i] /= accum;
FloatArray prob = NDArray::FromVector(_prob);
auto _check_given_sampler = [n_categories, n_rolls, &prob](
auto _check_given_sampler = [n_categories, n_rolls, &_prob](
utils::BaseSampler<Idx, DType, true> *s) {
std::vector<Idx> counter(n_categories, 0);
for (Idx i = 0; i < n_rolls; ++i) {
......@@ -28,7 +32,17 @@ void _TestWithReplacement(RandomEngine *re) {
counter[dice]++;
}
for (Idx i = 0; i < n_categories; ++i)
ASSERT_NEAR(static_cast<DType>(counter[i]) / n_rolls, prob[i], 1e-2);
ASSERT_NEAR(static_cast<DType>(counter[i]) / n_rolls, _prob[i], 1e-2);
};
auto _check_random_choice = [n_categories, n_rolls, &_prob, prob]() {
std::vector<int64_t> counter(n_categories, 0);
for (Idx i = 0; i < n_rolls; ++i) {
Idx dice = RandomEngine::ThreadLocal()->Choice<int64_t>(prob);
counter[dice]++;
}
for (Idx i = 0; i < n_categories; ++i)
ASSERT_NEAR(static_cast<DType>(counter[i]) / n_rolls, _prob[i], 1e-2);
};
utils::AliasSampler<Idx, DType, true> as(re, prob);
......@@ -37,6 +51,7 @@ void _TestWithReplacement(RandomEngine *re) {
_check_given_sampler(&as);
_check_given_sampler(&cs);
_check_given_sampler(&ts);
_check_random_choice();
}
TEST(SampleUtilsTest, TestWithReplacement) {
......@@ -53,7 +68,9 @@ TEST(SampleUtilsTest, TestWithReplacement) {
template <typename Idx, typename DType>
void _TestWithoutReplacementOrder(RandomEngine *re) {
std::vector<DType> prob = {1e6, 1e-6, 1e-2, 1e2};
// TODO(BarclayII): is there a reliable way to do this test?
std::vector<DType> _prob = {1e6, 1e-6, 1e-2, 1e2};
FloatArray prob = NDArray::FromVector(_prob);
std::vector<Idx> ground_truth = {0, 3, 2, 1};
auto _check_given_sampler = [&ground_truth](
......@@ -87,9 +104,10 @@ TEST(SampleUtilsTest, TestWithoutReplacementOrder) {
template <typename Idx, typename DType>
void _TestWithoutReplacementUnique(RandomEngine *re) {
Idx N = 1000000;
std::vector<DType> likelihood;
std::vector<DType> _likelihood;
for (Idx i = 0; i < N; ++i)
likelihood.push_back(re->Uniform<DType>());
_likelihood.push_back(re->Uniform<DType>());
FloatArray likelihood = NDArray::FromVector(_likelihood);
auto _check_given_sampler = [N](
utils::BaseSampler<Idx, DType, false> *s) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment