metapath_randomwalk.h 9.03 KB
Newer Older
1
/**
2
 *  Copyright (c) 2018 by Contributors
3
4
 * @file graph/sampler/generic_randomwalk_cpu.h
 * @brief DGL sampler - templated implementation definition of random walks on
5
 *     CPU.
6
7
 */

8
9
#ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_METAPATH_RANDOMWALK_H_
#define DGL_GRAPH_SAMPLING_RANDOMWALKS_METAPATH_RANDOMWALK_H_
10
11
12
13

#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/random.h>
14

Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
15
#include <tuple>
16
#include <utility>
17
#include <vector>
18

19
#include "randomwalks_cpu.h"
20
#include "randomwalks_impl.h"
21
22
23
24
25
26
27
28
29
30
31
32

namespace dgl {

using namespace dgl::runtime;
using namespace dgl::aten;

namespace sampling {

namespace impl {

namespace {

33
template <typename IdxType>
34
35
using TerminatePredicate = std::function<bool(IdxType *, dgl_id_t, int64_t)>;

36
/**
37
 * @brief Select one successor of metapath-based random walk, given the path
38
 *     generated so far.
39
 *
40
41
42
 * @param data The path generated so far, of type \c IdxType.
 * @param curr The last node ID generated.
 * @param len The number of nodes generated so far.  Note that the seed node is
43
 *     always included as \c data[0], and the successors start from \c data[1].
44
 *
45
46
47
48
 * @param edges_by_type Vector of results from \c GetAdj() by edge type.
 * @param metapath_data Edge types of given metapath.
 * @param prob Transition probability per edge type.
 * @param terminate Predicate for terminating the current random walk path.
49
 *
50
 * @return A tuple of ID of next successor (-1 if not exist), the last traversed
51
 *     edge ID, as well as whether to terminate.
52
 */
53
template <DGLDeviceType XPU, typename IdxType>
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
54
std::tuple<dgl_id_t, dgl_id_t, bool> MetapathRandomWalkStep(
55
    IdxType *data, dgl_id_t curr, int64_t len,
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
56
    const std::vector<CSRMatrix> &edges_by_type,
57
    const std::vector<bool> &csr_has_data, const IdxType *metapath_data,
58
59
60
61
    const std::vector<FloatArray> &prob,
    TerminatePredicate<IdxType> terminate) {
  dgl_type_t etype = metapath_data[len];

62
63
64
65
  // Note that since the selection of successors is very lightweight (especially
  // in the uniform case), we want to reduce the overheads (even from object
  // copies or object construction) as much as possible. Using Successors()
  // slows down by 2x. Using OutEdges() slows down by 10x.
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
66
67
68
  const CSRMatrix &csr = edges_by_type[etype];
  const IdxType *offsets = csr.indptr.Ptr<IdxType>();
  const IdxType *all_succ = csr.indices.Ptr<IdxType>();
69
70
  const IdxType *all_eids =
      csr_has_data[etype] ? csr.data.Ptr<IdxType>() : nullptr;
71
  const IdxType *succ = all_succ + offsets[curr];
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
72
  const IdxType *eids = all_eids ? (all_eids + offsets[curr]) : nullptr;
73
74

  const int64_t size = offsets[curr + 1] - offsets[curr];
75
  if (size == 0) return std::make_tuple(-1, -1, true);
76

77
78
79
  // Use a reference to the original array instead of copying. This avoids
  // updating the ref counts atomically from different threads and avoids cache
  // ping-ponging in the tight loop.
80
  const FloatArray &prob_etype = prob[etype];
Jinjing Zhou's avatar
Jinjing Zhou committed
81
  IdxType idx = 0;
82
  if (IsNullArray(prob_etype)) {
83
84
85
86
    // empty probability array; assume uniform
    idx = RandomEngine::ThreadLocal()->RandInt(size);
  } else {
    ATEN_FLOAT_TYPE_SWITCH(prob_etype->dtype, DType, "probability", {
87
88
      FloatArray prob_selected =
          FloatArray::Empty({size}, prob_etype->dtype, prob_etype->ctx);
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
89
90
      DType *prob_selected_data = prob_selected.Ptr<DType>();
      const DType *prob_etype_data = prob_etype.Ptr<DType>();
91
      for (int64_t j = 0; j < size; ++j)
92
93
        prob_selected_data[j] =
            prob_etype_data[eids ? eids[j] : j + offsets[curr]];
94
95
96
      idx = RandomEngine::ThreadLocal()->Choice<IdxType>(prob_selected);
    });
  }
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
97
  dgl_id_t eid = eids ? eids[idx] : (idx + offsets[curr]);
98

Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
99
  return std::make_tuple(succ[idx], eid, terminate(data, curr, len));
100
101
}

102
/**
103
 * @brief Select one successor of metapath-based random walk, given the path
104
 *     generated so far specifically for the uniform probability distribution.
105
 *
106
107
108
 * @param data The path generated so far, of type \c IdxType.
 * @param curr The last node ID generated.
 * @param len The number of nodes generated so far.  Note that the seed node is
109
 *     always included as \c data[0], and the successors start from \c data[1].
110
 *
111
112
113
 * @param edges_by_type Vector of results from \c GetAdj() by edge type.
 * @param metapath_data Edge types of given metapath.
 * @param prob Transition probability per edge type, for this special case this
114
115
 *     will be a NullArray.
 * @param terminate Predicate for terminating the current random walk path.
116
 *
117
 * @return A pair of ID of next successor (-1 if not exist), as well as whether
118
119
 *     to terminate. \note This function is called only if all the probability
 *     arrays are null.
120
 */
121
template <DGLDeviceType XPU, typename IdxType>
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
122
std::tuple<dgl_id_t, dgl_id_t, bool> MetapathRandomWalkStepUniform(
123
    IdxType *data, dgl_id_t curr, int64_t len,
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
124
    const std::vector<CSRMatrix> &edges_by_type,
125
    const std::vector<bool> &csr_has_data, const IdxType *metapath_data,
126
127
128
129
    const std::vector<FloatArray> &prob,
    TerminatePredicate<IdxType> terminate) {
  dgl_type_t etype = metapath_data[len];

130
131
132
133
  // Note that since the selection of successors is very lightweight (especially
  // in the uniform case), we want to reduce the overheads (even from object
  // copies or object construction) as much as possible. Using Successors()
  // slows down by 2x. Using OutEdges() slows down by 10x.
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
134
135
136
  const CSRMatrix &csr = edges_by_type[etype];
  const IdxType *offsets = csr.indptr.Ptr<IdxType>();
  const IdxType *all_succ = csr.indices.Ptr<IdxType>();
137
138
  const IdxType *all_eids =
      csr_has_data[etype] ? csr.data.Ptr<IdxType>() : nullptr;
139
  const IdxType *succ = all_succ + offsets[curr];
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
140
  const IdxType *eids = all_eids ? (all_eids + offsets[curr]) : nullptr;
141
142

  const int64_t size = offsets[curr + 1] - offsets[curr];
143
  if (size == 0) return std::make_tuple(-1, -1, true);
144
145
146
147

  IdxType idx = 0;
  // Guaranteed uniform distribution
  idx = RandomEngine::ThreadLocal()->RandInt(size);
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
148
  dgl_id_t eid = eids ? eids[idx] : (idx + offsets[curr]);
149

Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
150
  return std::make_tuple(succ[idx], eid, terminate(data, curr, len));
151
152
}

153
/**
154
155
156
 * @brief Metapath-based random walk.
 * @param hg The heterograph.
 * @param seeds A 1D array of seed nodes, with the type the source type of the
157
 *     first edge type in the metapath.
158
159
 * @param metapath A 1D array of edge types representing the metapath.
 * @param prob A vector of 1D float arrays, indicating the transition
160
161
 *     probability of each edge by edge type.  An empty float array assumes
 *     uniform transition.
162
163
 * @param terminate Predicate for terminating a random walk path.
 * @return A 2D array of shape (len(seeds), len(metapath) + 1) with node IDs,
164
 *     and A 2D array of shape (len(seeds), len(metapath)) with edge IDs.
165
 */
166
template <DGLDeviceType XPU, typename IdxType>
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
167
std::pair<IdArray, IdArray> MetapathBasedRandomWalk(
168
    const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
169
170
171
172
    const std::vector<FloatArray> &prob,
    TerminatePredicate<IdxType> terminate) {
  int64_t max_num_steps = metapath->shape[0];
  const IdxType *metapath_data = static_cast<IdxType *>(metapath->data);
173
174
  const int64_t begin_ntype =
      hg->meta_graph()->FindEdge(metapath_data[0]).first;
175
  const int64_t max_nodes = hg->NumVertices(begin_ntype);
176
177

  // Prefetch all edges.
178
179
180
181
  // This forces the heterograph to materialize all OutCSR's before the OpenMP
  // loop; otherwise data races will happen.
  // TODO(BarclayII): should we later on materialize COO/CSR/CSC anyway unless
  // told otherwise?
182
183
184
185
186
187
188
189
  int64_t num_etypes = hg->NumEdgeTypes();
  std::vector<CSRMatrix> edges_by_type(num_etypes);
  std::vector<bool> csr_has_data(num_etypes);
  for (int64_t etype = 0; etype < num_etypes; ++etype) {
    const CSRMatrix &csr = hg->GetCSRMatrix(etype);
    edges_by_type[etype] = csr;
    csr_has_data[etype] = CSRHasData(csr);
  }
190

191
192
193
194
195
196
197
198
199
200
  // Hoist the check for Uniform vs Non uniform edge distribution
  // to avoid putting it on the hot path
  bool isUniform = true;
  for (const auto &etype_prob : prob) {
    if (!IsNullArray(etype_prob)) {
      isUniform = false;
      break;
    }
  }
  if (!isUniform) {
201
202
203
204
205
206
207
208
209
    StepFunc<IdxType> step = [&edges_by_type, &csr_has_data, metapath_data,
                              &prob, terminate](
                                 IdxType *data, dgl_id_t curr, int64_t len) {
      return MetapathRandomWalkStep<XPU, IdxType>(
          data, curr, len, edges_by_type, csr_has_data, metapath_data, prob,
          terminate);
    };
    return GenericRandomWalk<XPU, IdxType>(
        seeds, max_num_steps, step, max_nodes);
210
  } else {
211
212
213
214
215
216
217
218
219
    StepFunc<IdxType> step = [&edges_by_type, &csr_has_data, metapath_data,
                              &prob, terminate](
                                 IdxType *data, dgl_id_t curr, int64_t len) {
      return MetapathRandomWalkStepUniform<XPU, IdxType>(
          data, curr, len, edges_by_type, csr_has_data, metapath_data, prob,
          terminate);
    };
    return GenericRandomWalk<XPU, IdxType>(
        seeds, max_num_steps, step, max_nodes);
220
  }
221
222
223
224
225
226
227
228
229
230
}

};  // namespace

};  // namespace impl

};  // namespace sampling

};  // namespace dgl

231
#endif  // DGL_GRAPH_SAMPLING_RANDOMWALKS_METAPATH_RANDOMWALK_H_