metapath_randomwalk.h 8.57 KB
Newer Older
1
2
3
4
5
6
/*!
 *  Copyright (c) 2018 by Contributors
 * \file graph/sampler/generic_randomwalk_cpu.h
 * \brief DGL sampler - templated implementation definition of random walks on CPU
 */

7
8
#ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_METAPATH_RANDOMWALK_H_
#define DGL_GRAPH_SAMPLING_RANDOMWALKS_METAPATH_RANDOMWALK_H_
9
10
11
12
13

#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/random.h>
#include <utility>
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
14
#include <tuple>
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#include <vector>
#include "randomwalks_impl.h"
#include "randomwalks_cpu.h"

namespace dgl {

using namespace dgl::runtime;
using namespace dgl::aten;

namespace sampling {

namespace impl {

namespace {

// bool WhetherToTerminate(
//     IdxType *node_ids_generated_so_far,
//     dgl_id_t last_node_id_generated,
//     int64_t number_of_nodes_generated_so_far)
template<typename IdxType>
using TerminatePredicate = std::function<bool(IdxType *, dgl_id_t, int64_t)>;

/*!
 * \brief Select one successor of metapath-based random walk, given the path generated
 * so far.
 *
 * \param data The path generated so far, of type \c IdxType.
 * \param curr The last node ID generated.
 * \param len The number of nodes generated so far.  Note that the seed node is always
 *            included as \c data[0], and the successors start from \c data[1].
 *
 * \param edges_by_type Vector of results from \c GetAdj() by edge type.
 * \param metapath_data Edge types of given metapath.
 * \param prob Transition probability per edge type.
 * \param terminate Predicate for terminating the current random walk path.
 *
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
51
52
 * \return A tuple of ID of next successor (-1 if not exist), the last traversed edge
 *         ID, as well as whether to terminate.
53
54
 */
template<DLDeviceType XPU, typename IdxType>
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
55
std::tuple<dgl_id_t, dgl_id_t, bool> MetapathRandomWalkStep(
56
57
58
    IdxType *data,
    dgl_id_t curr,
    int64_t len,
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
59
    const std::vector<CSRMatrix> &edges_by_type,
60
61
62
63
64
65
66
67
68
69
    const IdxType *metapath_data,
    const std::vector<FloatArray> &prob,
    TerminatePredicate<IdxType> terminate) {
  dgl_type_t etype = metapath_data[len];

  // Note that since the selection of successors is very lightweight (especially in the
  // uniform case), we want to reduce the overheads (even from object copies or object
  // construction) as much as possible.
  // Using Successors() slows down by 2x.
  // Using OutEdges() slows down by 10x.
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
70
71
72
73
  const CSRMatrix &csr = edges_by_type[etype];
  const IdxType *offsets = csr.indptr.Ptr<IdxType>();
  const IdxType *all_succ = csr.indices.Ptr<IdxType>();
  const IdxType *all_eids = CSRHasData(csr) ? csr.data.Ptr<IdxType>() : nullptr;
74
  const IdxType *succ = all_succ + offsets[curr];
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
75
  const IdxType *eids = all_eids ? (all_eids + offsets[curr]) : nullptr;
76
77
78

  const int64_t size = offsets[curr + 1] - offsets[curr];
  if (size == 0)
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
79
    return std::make_tuple(-1, -1, true);
80

81
82
83
84
  // Use a reference to the original array instead of copying
  // This avoids updating the ref counts atomically from different threads
  // and avoids cache ping-ponging in the tight loop
  const FloatArray &prob_etype = prob[etype];
Jinjing Zhou's avatar
Jinjing Zhou committed
85
  IdxType idx = 0;
86
  if (IsNullArray(prob_etype)) {
87
88
89
90
91
    // empty probability array; assume uniform
    idx = RandomEngine::ThreadLocal()->RandInt(size);
  } else {
    ATEN_FLOAT_TYPE_SWITCH(prob_etype->dtype, DType, "probability", {
      FloatArray prob_selected = FloatArray::Empty({size}, prob_etype->dtype, prob_etype->ctx);
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
92
93
      DType *prob_selected_data = prob_selected.Ptr<DType>();
      const DType *prob_etype_data = prob_etype.Ptr<DType>();
94
      for (int64_t j = 0; j < size; ++j)
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
95
        prob_selected_data[j] = prob_etype_data[eids ? eids[j] : j + offsets[curr]];
96
97
98
      idx = RandomEngine::ThreadLocal()->Choice<IdxType>(prob_selected);
    });
  }
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
99
  dgl_id_t eid = eids ? eids[idx] : (idx + offsets[curr]);
100

Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
101
  return std::make_tuple(succ[idx], eid, terminate(data, curr, len));
102
103
}

104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/*!
 * \brief Select one successor of metapath-based random walk, given the path generated
 * so far specifically for the uniform probability distribution.
 *
 * \param data The path generated so far, of type \c IdxType.
 * \param curr The last node ID generated.
 * \param len The number of nodes generated so far.  Note that the seed node is always
 *            included as \c data[0], and the successors start from \c data[1].
 *
 * \param edges_by_type Vector of results from \c GetAdj() by edge type.
 * \param metapath_data Edge types of given metapath.
 * \param prob Transition probability per edge type, for this special case this will be a NullArray
 * \param terminate Predicate for terminating the current random walk path.
 *
 * \return A pair of ID of next successor (-1 if not exist), as well as whether to terminate.
 * \note This function is called only if all the probability arrays are null.
 */
template<DLDeviceType XPU, typename IdxType>
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
122
std::tuple<dgl_id_t, dgl_id_t, bool> MetapathRandomWalkStepUniform(
123
124
125
    IdxType *data,
    dgl_id_t curr,
    int64_t len,
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
126
    const std::vector<CSRMatrix> &edges_by_type,
127
128
129
130
131
132
133
134
135
136
    const IdxType *metapath_data,
    const std::vector<FloatArray> &prob,
    TerminatePredicate<IdxType> terminate) {
  dgl_type_t etype = metapath_data[len];

  // Note that since the selection of successors is very lightweight (especially in the
  // uniform case), we want to reduce the overheads (even from object copies or object
  // construction) as much as possible.
  // Using Successors() slows down by 2x.
  // Using OutEdges() slows down by 10x.
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
137
138
139
140
  const CSRMatrix &csr = edges_by_type[etype];
  const IdxType *offsets = csr.indptr.Ptr<IdxType>();
  const IdxType *all_succ = csr.indices.Ptr<IdxType>();
  const IdxType *all_eids = CSRHasData(csr) ? csr.data.Ptr<IdxType>() : nullptr;
141
  const IdxType *succ = all_succ + offsets[curr];
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
142
  const IdxType *eids = all_eids ? (all_eids + offsets[curr]) : nullptr;
143
144
145

  const int64_t size = offsets[curr + 1] - offsets[curr];
  if (size == 0)
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
146
    return std::make_tuple(-1, -1, true);
147
148
149
150

  IdxType idx = 0;
  // Guaranteed uniform distribution
  idx = RandomEngine::ThreadLocal()->RandInt(size);
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
151
  dgl_id_t eid = eids ? eids[idx] : (idx + offsets[curr]);
152

Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
153
  return std::make_tuple(succ[idx], eid, terminate(data, curr, len));
154
155
}

156
157
158
159
160
161
162
163
164
/*!
 * \brief Metapath-based random walk.
 * \param hg The heterograph.
 * \param seeds A 1D array of seed nodes, with the type the source type of the first
 *        edge type in the metapath.
 * \param metapath A 1D array of edge types representing the metapath.
 * \param prob A vector of 1D float arrays, indicating the transition probability of
 *        each edge by edge type.  An empty float array assumes uniform transition.
 * \param terminate Predicate for terminating a random walk path.
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
165
166
 * \return A 2D array of shape (len(seeds), len(metapath) + 1) with node IDs, and
 *         A 2D array of shape (len(seeds), len(metapath)) with edge IDs.
167
168
 */
template<DLDeviceType XPU, typename IdxType>
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
169
std::pair<IdArray, IdArray> MetapathBasedRandomWalk(
170
171
172
173
174
175
176
177
178
179
180
181
    const HeteroGraphPtr hg,
    const IdArray seeds,
    const TypeArray metapath,
    const std::vector<FloatArray> &prob,
    TerminatePredicate<IdxType> terminate) {
  int64_t max_num_steps = metapath->shape[0];
  const IdxType *metapath_data = static_cast<IdxType *>(metapath->data);

  // Prefetch all edges.
  // This forces the heterograph to materialize all OutCSR's before the OpenMP loop;
  // otherwise data races will happen.
  // TODO(BarclayII): should we later on materialize COO/CSR/CSC anyway unless told otherwise?
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
182
  std::vector<CSRMatrix> edges_by_type;
183
  for (dgl_type_t etype = 0; etype < hg->NumEdgeTypes(); ++etype)
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
184
    edges_by_type.push_back(hg->GetCSRMatrix(etype));
185

186
187
188
189
190
191
192
193
194
195
  // Hoist the check for Uniform vs Non uniform edge distribution
  // to avoid putting it on the hot path
  bool isUniform = true;
  for (const auto &etype_prob : prob) {
    if (!IsNullArray(etype_prob)) {
      isUniform = false;
      break;
    }
  }
  if (!isUniform) {
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
196
    StepFunc<IdxType> step =
197
198
199
200
201
      [&edges_by_type, metapath_data, &prob, terminate]
      (IdxType *data, dgl_id_t curr, int64_t len) {
        return MetapathRandomWalkStep<XPU, IdxType>(
            data, curr, len, edges_by_type, metapath_data, prob, terminate);
      };
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
202
    return GenericRandomWalk<XPU, IdxType>(seeds, max_num_steps, step);
203
  } else {
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
204
    StepFunc<IdxType> step =
205
206
207
208
209
      [&edges_by_type, metapath_data, &prob, terminate]
      (IdxType *data, dgl_id_t curr, int64_t len) {
        return MetapathRandomWalkStepUniform<XPU, IdxType>(
            data, curr, len, edges_by_type, metapath_data, prob, terminate);
      };
Quan (Andy) Gan's avatar
Quan (Andy) Gan committed
210
    return GenericRandomWalk<XPU, IdxType>(seeds, max_num_steps, step);
211
  }
212
213
214
215
216
217
218
219
220
221
}

};  // namespace

};  // namespace impl

};  // namespace sampling

};  // namespace dgl

222
#endif  // DGL_GRAPH_SAMPLING_RANDOMWALKS_METAPATH_RANDOMWALK_H_