"vscode:/vscode.git/clone" did not exist on "1c4fdfe2bbe72f9942231daffe176a5b19692df4"
sampler.cc 30.7 KB
Newer Older
Da Zheng's avatar
Da Zheng committed
1
2
3
4
5
6
7
/*!
 *  Copyright (c) 2018 by Contributors
 * \file graph/sampler.cc
 * \brief DGL sampler implementation
 */

#include <dgl/sampler.h>
8
#include <dmlc/omp.h>
Da Zheng's avatar
Da Zheng committed
9
#include <dgl/immutable_graph.h>
10
#include <dmlc/omp.h>
Da Zheng's avatar
Da Zheng committed
11
#include <algorithm>
12
13
#include <cstdlib>
#include <cmath>
14
#include <numeric>
15
#include "../c_api_common.h"
Da Zheng's avatar
Da Zheng committed
16

17
18
19
20
21
22
using dgl::runtime::DGLArgs;
using dgl::runtime::DGLArgValue;
using dgl::runtime::DGLRetValue;
using dgl::runtime::PackedFunc;
using dgl::runtime::NDArray;

Da Zheng's avatar
Da Zheng committed
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
namespace dgl {

namespace {
/*
 * ArrayHeap is used to sample elements from vector
 */
class ArrayHeap {
 public:
  explicit ArrayHeap(const std::vector<float>& prob) {
    vec_size_ = prob.size();
    bit_len_ = ceil(log2(vec_size_));
    limit_ = 1 << bit_len_;
    // allocate twice the size
    heap_.resize(limit_ << 1, 0);
    // allocate the leaves
    for (int i = limit_; i < vec_size_+limit_; ++i) {
      heap_[i] = prob[i-limit_];
    }
    // iterate up the tree (this is O(m))
    for (int i = bit_len_-1; i >= 0; --i) {
      for (int j = (1 << i); j < (1 << (i + 1)); ++j) {
        heap_[j] = heap_[j << 1] + heap_[(j << 1) + 1];
      }
    }
  }
  ~ArrayHeap() {}

  /*
   * Remove term from index (this costs O(log m) steps)
   */
  void Delete(size_t index) {
    size_t i = index + limit_;
    float w = heap_[i];
    for (int j = bit_len_; j >= 0; --j) {
      heap_[i] -= w;
      i = i >> 1;
    }
  }

  /*
   * Add value w to index (this costs O(log m) steps)
   */
  void Add(size_t index, float w) {
    size_t i = index + limit_;
    for (int j = bit_len_; j >= 0; --j) {
      heap_[i] += w;
      i = i >> 1;
    }
  }

  /*
   * Sample from arrayHeap
   */
  size_t Sample(unsigned int* seed) {
    float xi = heap_[1] * (rand_r(seed)%100/101.0);
    int i = 1;
    while (i < limit_) {
      i = i << 1;
      if (xi >= heap_[i]) {
        xi -= heap_[i];
        i += 1;
      }
    }
    return i - limit_;
  }

  /*
   * Sample a vector by given the size n
   */
  void SampleWithoutReplacement(size_t n, std::vector<size_t>* samples, unsigned int* seed) {
    // sample n elements
    for (size_t i = 0; i < n; ++i) {
      samples->at(i) = this->Sample(seed);
      this->Delete(samples->at(i));
    }
  }

 private:
  int vec_size_;  // sample size
  int bit_len_;   // bit size
  int limit_;
  std::vector<float> heap_;
};

/*
 * Uniformly sample integers from [0, set_size) without replacement.
 */
void RandomSample(size_t set_size, size_t num, std::vector<size_t>* out, unsigned int* seed) {
  std::unordered_set<size_t> sampled_idxs;
  while (sampled_idxs.size() < num) {
    sampled_idxs.insert(rand_r(seed) % set_size);
  }
  out->clear();
  out->insert(out->end(), sampled_idxs.begin(), sampled_idxs.end());
}

/*
 * For a sparse array whose non-zeros are represented by nz_idxs,
 * negate the sparse array and outputs the non-zeros in the negated array.
 */
void NegateArray(const std::vector<size_t> &nz_idxs,
                 size_t arr_size,
                 std::vector<size_t>* out) {
  // nz_idxs must have been sorted.
  auto it = nz_idxs.begin();
  size_t i = 0;
  CHECK_GT(arr_size, nz_idxs.back());
  for (; i < arr_size && it != nz_idxs.end(); i++) {
    if (*it == i) {
      it++;
      continue;
    }
    out->push_back(i);
  }
  for (; i < arr_size; i++) {
    out->push_back(i);
  }
}

/*
 * Uniform sample vertices from a list of vertices.
 */
void GetUniformSample(const dgl_id_t* edge_id_list,
                      const dgl_id_t* vid_list,
                      const size_t ver_len,
                      const size_t max_num_neighbor,
                      std::vector<dgl_id_t>* out_ver,
                      std::vector<dgl_id_t>* out_edge,
                      unsigned int* seed) {
  // Copy vid_list to output
  if (ver_len <= max_num_neighbor) {
    out_ver->insert(out_ver->end(), vid_list, vid_list + ver_len);
    out_edge->insert(out_edge->end(), edge_id_list, edge_id_list + ver_len);
    return;
  }
  // If we just sample a small number of elements from a large neighbor list.
  std::vector<size_t> sorted_idxs;
  if (ver_len > max_num_neighbor * 2) {
    sorted_idxs.reserve(max_num_neighbor);
    RandomSample(ver_len, max_num_neighbor, &sorted_idxs, seed);
    std::sort(sorted_idxs.begin(), sorted_idxs.end());
  } else {
    std::vector<size_t> negate;
    negate.reserve(ver_len - max_num_neighbor);
    RandomSample(ver_len, ver_len - max_num_neighbor,
                 &negate, seed);
    std::sort(negate.begin(), negate.end());
    NegateArray(negate, ver_len, &sorted_idxs);
  }
  // verify the result.
  CHECK_EQ(sorted_idxs.size(), max_num_neighbor);
  for (size_t i = 1; i < sorted_idxs.size(); i++) {
    CHECK_GT(sorted_idxs[i], sorted_idxs[i - 1]);
  }
  for (auto idx : sorted_idxs) {
    out_ver->push_back(vid_list[idx]);
    out_edge->push_back(edge_id_list[idx]);
  }
}

/*
 * Non-uniform sample via ArrayHeap
 */
void GetNonUniformSample(const float* probability,
                         const dgl_id_t* edge_id_list,
                         const dgl_id_t* vid_list,
                         const size_t ver_len,
                         const size_t max_num_neighbor,
                         std::vector<dgl_id_t>* out_ver,
                         std::vector<dgl_id_t>* out_edge,
                         unsigned int* seed) {
  // Copy vid_list to output
  if (ver_len <= max_num_neighbor) {
    out_ver->insert(out_ver->end(), vid_list, vid_list + ver_len);
    out_edge->insert(out_edge->end(), edge_id_list, edge_id_list + ver_len);
    return;
  }
  // Make sample
  std::vector<size_t> sp_index(max_num_neighbor);
  std::vector<float> sp_prob(ver_len);
  for (size_t i = 0; i < ver_len; ++i) {
    sp_prob[i] = probability[vid_list[i]];
  }
  ArrayHeap arrayHeap(sp_prob);
  arrayHeap.SampleWithoutReplacement(max_num_neighbor, &sp_index, seed);
  out_ver->resize(max_num_neighbor);
  out_edge->resize(max_num_neighbor);
  for (size_t i = 0; i < max_num_neighbor; ++i) {
    size_t idx = sp_index[i];
    out_ver->at(i) = vid_list[idx];
    out_edge->at(i) = edge_id_list[idx];
  }
  sort(out_ver->begin(), out_ver->end());
  sort(out_edge->begin(), out_edge->end());
}

/*
 * Used for subgraph sampling
 */
struct neigh_list {
  std::vector<dgl_id_t> neighs;
  std::vector<dgl_id_t> edges;
  neigh_list(const std::vector<dgl_id_t> &_neighs,
             const std::vector<dgl_id_t> &_edges)
    : neighs(_neighs), edges(_edges) {}
};

struct neighbor_info {
  dgl_id_t id;
  size_t pos;
  size_t num_edges;

  neighbor_info(dgl_id_t id, size_t pos, size_t num_edges) {
    this->id = id;
    this->pos = pos;
    this->num_edges = num_edges;
  }
};

NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list,
                           std::vector<dgl_id_t> edge_list,
                           std::vector<size_t> layer_offsets,
                           std::vector<std::pair<dgl_id_t, int> > *sub_vers,
                           std::vector<neighbor_info> *neigh_pos,
                           const std::string &edge_type,
                           int64_t num_edges, int num_hops, bool is_multigraph) {
  NodeFlow nf;
  uint64_t num_vertices = sub_vers->size();
251
252
253
254
  nf.node_mapping = NewIdArray(num_vertices);
  nf.edge_mapping = NewIdArray(num_edges);
  nf.layer_offsets = NewIdArray(num_hops + 1);
  nf.flow_offsets = NewIdArray(num_hops);
Da Zheng's avatar
Da Zheng committed
255
256
257
258
259
260
261

  dgl_id_t *node_map_data = static_cast<dgl_id_t *>(nf.node_mapping->data);
  dgl_id_t *layer_off_data = static_cast<dgl_id_t *>(nf.layer_offsets->data);
  dgl_id_t *flow_off_data = static_cast<dgl_id_t *>(nf.flow_offsets->data);
  dgl_id_t *edge_map_data = static_cast<dgl_id_t *>(nf.edge_mapping->data);

  // Construct sub_csr_graph
262
263
264
265
266
  // TODO(minjie): is nodeflow a multigraph?
  auto subg_csr = CSRPtr(new CSR(num_vertices, num_edges, is_multigraph));
  dgl_id_t* indptr_out = static_cast<dgl_id_t*>(subg_csr->indptr()->data);
  dgl_id_t* col_list_out = static_cast<dgl_id_t*>(subg_csr->indices()->data);
  dgl_id_t* eid_out = static_cast<dgl_id_t*>(subg_csr->edge_ids()->data);
Da Zheng's avatar
Da Zheng committed
267
268
269
270
271
272
273
274
275
276
277
278
  size_t collected_nedges = 0;

  // The data from the previous steps:
  // * node data: sub_vers (vid, layer), neigh_pos,
  // * edge data: neighbor_list, edge_list, probability.
  // * layer_offsets: the offset in sub_vers.
  dgl_id_t ver_id = 0;
  std::vector<std::unordered_map<dgl_id_t, dgl_id_t>> layer_ver_maps;
  layer_ver_maps.resize(num_hops);
  size_t out_node_idx = 0;
  for (int layer_id = num_hops - 1; layer_id >= 0; layer_id--) {
    // We sort the vertices in a layer so that we don't need to sort the neighbor Ids
279
280
281
282
283
284
285
286
287
288
289
    // after remap to a subgraph. However, we don't need to sort the first layer
    // because we want the order of the nodes in the first layer is the same as
    // the input seed nodes.
    if (layer_id > 0) {
      std::sort(sub_vers->begin() + layer_offsets[layer_id],
                sub_vers->begin() + layer_offsets[layer_id + 1],
                [](const std::pair<dgl_id_t, dgl_id_t> &a1,
                   const std::pair<dgl_id_t, dgl_id_t> &a2) {
        return a1.first < a2.first;
      });
    }
Da Zheng's avatar
Da Zheng committed
290
291
292
293
294
295

    // Save the sampled vertices and its layer Id.
    for (size_t i = layer_offsets[layer_id]; i < layer_offsets[layer_id + 1]; i++) {
      node_map_data[out_node_idx++] = sub_vers->at(i).first;
      layer_ver_maps[layer_id].insert(std::pair<dgl_id_t, dgl_id_t>(sub_vers->at(i).first,
                                                                    ver_id++));
296
      CHECK_EQ(sub_vers->at(i).second, layer_id);
Da Zheng's avatar
Da Zheng committed
297
298
299
300
301
302
303
304
305
    }
  }
  CHECK(out_node_idx == num_vertices);

  // sampling algorithms have to start from the seed nodes, so the seed nodes are
  // in the first layer and the input nodes are in the last layer.
  // When we expose the sampled graph to a Python user, we say the input nodes
  // are in the first layer and the seed nodes are in the last layer.
  // Thus, when we copy sampled results to a CSR, we need to reverse the order of layers.
306
307
  std::fill(indptr_out, indptr_out + num_vertices + 1, 0);
  size_t row_idx = layer_offsets[num_hops] - layer_offsets[num_hops - 1];
Da Zheng's avatar
Da Zheng committed
308
309
  layer_off_data[0] = 0;
  layer_off_data[1] = layer_offsets[num_hops] - layer_offsets[num_hops - 1];
310
  int out_layer_idx = 1;
Da Zheng's avatar
Da Zheng committed
311
  for (int layer_id = num_hops - 2; layer_id >= 0; layer_id--) {
312
313
314
315
316
317
318
319
320
    // Because we don't sort the vertices in the first layer above, we can't sort
    // the neighbor positions of the vertices in the first layer either.
    if (layer_id > 0) {
      std::sort(neigh_pos->begin() + layer_offsets[layer_id],
                neigh_pos->begin() + layer_offsets[layer_id + 1],
                [](const neighbor_info &a1, const neighbor_info &a2) {
                  return a1.id < a2.id;
                });
    }
Da Zheng's avatar
Da Zheng committed
321
322
323

    for (size_t i = layer_offsets[layer_id]; i < layer_offsets[layer_id + 1]; i++) {
      dgl_id_t dst_id = sub_vers->at(i).first;
324
      CHECK_EQ(dst_id, neigh_pos->at(i).id);
Da Zheng's avatar
Da Zheng committed
325
      size_t pos = neigh_pos->at(i).pos;
326
      CHECK_LE(pos, neighbor_list.size());
327
328
      const size_t nedges = neigh_pos->at(i).num_edges;
      if (neighbor_list.empty()) CHECK_EQ(nedges, 0);
Da Zheng's avatar
Da Zheng committed
329
330
331

      // We need to map the Ids of the neighbors to the subgraph.
      auto neigh_it = neighbor_list.begin() + pos;
332
      for (size_t i = 0; i < nedges; i++) {
Da Zheng's avatar
Da Zheng committed
333
        dgl_id_t neigh = *(neigh_it + i);
334
        CHECK(layer_ver_maps[layer_id + 1].find(neigh) != layer_ver_maps[layer_id + 1].end());
Da Zheng's avatar
Da Zheng committed
335
336
337
338
        col_list_out[collected_nedges + i] = layer_ver_maps[layer_id + 1][neigh];
      }
      // We can simply copy the edge Ids.
      std::copy_n(edge_list.begin() + pos,
339
340
341
                  nedges, edge_map_data + collected_nedges);
      collected_nedges += nedges;
      indptr_out[row_idx+1] = indptr_out[row_idx] + nedges;
Da Zheng's avatar
Da Zheng committed
342
343
344
345
346
347
      row_idx++;
    }
    layer_off_data[out_layer_idx + 1] = layer_off_data[out_layer_idx]
        + layer_offsets[layer_id + 1] - layer_offsets[layer_id];
    out_layer_idx++;
  }
348
349
350
351
  CHECK_EQ(row_idx, num_vertices);
  CHECK_EQ(indptr_out[row_idx], num_edges);
  CHECK_EQ(out_layer_idx, num_hops);
  CHECK_EQ(layer_off_data[out_layer_idx], num_vertices);
Da Zheng's avatar
Da Zheng committed
352
353
354

  // Copy flow offsets.
  flow_off_data[0] = 0;
355
356
  int out_flow_idx = 0;
  for (size_t i = 0; i < layer_offsets.size() - 2; i++) {
357
    size_t num_edges = indptr_out[layer_off_data[i + 2]] - indptr_out[layer_off_data[i + 1]];
Da Zheng's avatar
Da Zheng committed
358
359
360
361
    flow_off_data[out_flow_idx + 1] = flow_off_data[out_flow_idx] + num_edges;
    out_flow_idx++;
  }
  CHECK(out_flow_idx == num_hops - 1);
362
  CHECK(flow_off_data[num_hops - 1] == static_cast<uint64_t>(num_edges));
Da Zheng's avatar
Da Zheng committed
363

364
  std::iota(eid_out, eid_out + num_edges, 0);
Da Zheng's avatar
Da Zheng committed
365

366
367
  if (edge_type == std::string("in")) {
    nf.graph = GraphPtr(new ImmutableGraph(subg_csr, nullptr));
Da Zheng's avatar
Da Zheng committed
368
  } else {
369
    nf.graph = GraphPtr(new ImmutableGraph(nullptr, subg_csr));
Da Zheng's avatar
Da Zheng committed
370
371
372
373
374
375
  }

  return nf;
}

NodeFlow SampleSubgraph(const ImmutableGraph *graph,
376
                        const std::vector<dgl_id_t>& seeds,
Da Zheng's avatar
Da Zheng committed
377
378
379
                        const float* probability,
                        const std::string &edge_type,
                        int num_hops,
380
381
                        size_t num_neighbor,
                        const bool add_self_loop) {
382
  unsigned int time_seed = randseed();
383
  const size_t num_seeds = seeds.size();
Da Zheng's avatar
Da Zheng committed
384
  auto orig_csr = edge_type == "in" ? graph->GetInCSR() : graph->GetOutCSR();
385
386
387
  const dgl_id_t* val_list = static_cast<dgl_id_t*>(orig_csr->edge_ids()->data);
  const dgl_id_t* col_list = static_cast<dgl_id_t*>(orig_csr->indices()->data);
  const dgl_id_t* indptr = static_cast<dgl_id_t*>(orig_csr->indptr()->data);
Da Zheng's avatar
Da Zheng committed
388
389
390
391
392
393

  std::unordered_set<dgl_id_t> sub_ver_map;  // The vertex Ids in a layer.
  std::vector<std::pair<dgl_id_t, int> > sub_vers;
  sub_vers.reserve(num_seeds * 10);
  // add seed vertices
  for (size_t i = 0; i < num_seeds; ++i) {
394
    auto ret = sub_ver_map.insert(seeds[i]);
Da Zheng's avatar
Da Zheng committed
395
396
    // If the vertex is inserted successfully.
    if (ret.second) {
397
      sub_vers.emplace_back(seeds[i], 0);
Da Zheng's avatar
Da Zheng committed
398
399
400
401
402
403
404
405
406
407
408
409
410
411
    }
  }
  std::vector<dgl_id_t> tmp_sampled_src_list;
  std::vector<dgl_id_t> tmp_sampled_edge_list;
  // ver_id, position
  std::vector<neighbor_info> neigh_pos;
  neigh_pos.reserve(num_seeds);
  std::vector<dgl_id_t> neighbor_list;
  std::vector<dgl_id_t> edge_list;
  std::vector<size_t> layer_offsets(num_hops + 1);
  int64_t num_edges = 0;

  layer_offsets[0] = 0;
  layer_offsets[1] = sub_vers.size();
412
  for (int layer_id = 1; layer_id < num_hops; layer_id++) {
Da Zheng's avatar
Da Zheng committed
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
    // We need to avoid resampling the same node in a layer, but we allow a node
    // to be resampled in multiple layers. We use `sub_ver_map` to keep track of
    // sampled nodes in a layer, and clear it when entering a new layer.
    sub_ver_map.clear();
    // Previous iteration collects all nodes in sub_vers, which are collected
    // in the previous layer. sub_vers is used both as a node collection and a queue.
    for (size_t idx = layer_offsets[layer_id - 1]; idx < layer_offsets[layer_id]; idx++) {
      dgl_id_t dst_id = sub_vers[idx].first;
      const int cur_node_level = sub_vers[idx].second;

      tmp_sampled_src_list.clear();
      tmp_sampled_edge_list.clear();
      dgl_id_t ver_len = *(indptr+dst_id+1) - *(indptr+dst_id);
      if (probability == nullptr) {  // uniform-sample
        GetUniformSample(val_list + *(indptr + dst_id),
                         col_list + *(indptr + dst_id),
                         ver_len,
                         num_neighbor,
                         &tmp_sampled_src_list,
                         &tmp_sampled_edge_list,
                         &time_seed);
      } else {  // non-uniform-sample
        GetNonUniformSample(probability,
                            val_list + *(indptr + dst_id),
                            col_list + *(indptr + dst_id),
                            ver_len,
                            num_neighbor,
                            &tmp_sampled_src_list,
                            &tmp_sampled_edge_list,
                            &time_seed);
      }
Da Zheng's avatar
Da Zheng committed
444
445
446
      // If we need to add self loop and it doesn't exist in the sampled neighbor list.
      if (add_self_loop && std::find(tmp_sampled_src_list.begin(), tmp_sampled_src_list.end(),
                                     dst_id) == tmp_sampled_src_list.end()) {
447
        tmp_sampled_src_list.push_back(dst_id);
Da Zheng's avatar
Da Zheng committed
448
449
450
451
452
453
454
455
456
457
        const dgl_id_t *src_list = col_list + *(indptr + dst_id);
        const dgl_id_t *eid_list = val_list + *(indptr + dst_id);
        // TODO(zhengda) this operation has O(N) complexity. It can be pretty slow.
        const dgl_id_t *src = std::find(src_list, src_list + ver_len, dst_id);
        // If there doesn't exist a self loop in the graph.
        // we have to add -1 as the edge id for the self-loop edge.
        if (src == src_list + ver_len)
          tmp_sampled_edge_list.push_back(-1);
        else
          tmp_sampled_edge_list.push_back(eid_list[src - src_list]);
458
      }
Da Zheng's avatar
Da Zheng committed
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
      CHECK_EQ(tmp_sampled_src_list.size(), tmp_sampled_edge_list.size());
      neigh_pos.emplace_back(dst_id, neighbor_list.size(), tmp_sampled_src_list.size());
      // Then push the vertices
      for (size_t i = 0; i < tmp_sampled_src_list.size(); ++i) {
        neighbor_list.push_back(tmp_sampled_src_list[i]);
      }
      // Finally we push the edge list
      for (size_t i = 0; i < tmp_sampled_edge_list.size(); ++i) {
        edge_list.push_back(tmp_sampled_edge_list[i]);
      }
      num_edges += tmp_sampled_src_list.size();
      for (size_t i = 0; i < tmp_sampled_src_list.size(); ++i) {
        // We need to add the neighbor in the hashtable here. This ensures that
        // the vertex in the queue is unique. If we see a vertex before, we don't
        // need to add it to the queue again.
        auto ret = sub_ver_map.insert(tmp_sampled_src_list[i]);
        // If the sampled neighbor is inserted to the map successfully.
        if (ret.second) {
          sub_vers.emplace_back(tmp_sampled_src_list[i], cur_node_level + 1);
        }
      }
    }
    layer_offsets[layer_id + 1] = layer_offsets[layer_id] + sub_ver_map.size();
    CHECK_EQ(layer_offsets[layer_id + 1], sub_vers.size());
  }

  return ConstructNodeFlow(neighbor_list, edge_list, layer_offsets, &sub_vers, &neigh_pos,
                           edge_type, num_edges, num_hops, graph->IsMultigraph());
}

489
}  // namespace
Da Zheng's avatar
Da Zheng committed
490

491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetGraph")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    void* ptr = args[0];
    const NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
    GraphInterface* gptr = nflow->graph->Reset();
    *rv = gptr;
  });

DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetNodeMapping")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    void* ptr = args[0];
    const NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
    *rv = nflow->node_mapping;
  });

DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetEdgeMapping")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    void* ptr = args[0];
    const NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
    *rv = nflow->edge_mapping;
  });

DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetLayerOffsets")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    void* ptr = args[0];
    const NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
    *rv = nflow->layer_offsets;
  });

DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetBlockOffsets")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    void* ptr = args[0];
    const NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
    *rv = nflow->flow_offsets;
  });

DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowFree")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    void* ptr = args[0];
    NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
    delete nflow;
  });

NodeFlow SamplerOp::NeighborUniformSample(const ImmutableGraph *graph,
                                          const std::vector<dgl_id_t>& seeds,
Da Zheng's avatar
Da Zheng committed
536
                                          const std::string &edge_type,
537
538
                                          int num_hops, int expand_factor,
                                          const bool add_self_loop) {
Da Zheng's avatar
Da Zheng committed
539
540
541
542
543
  return SampleSubgraph(graph,
                        seeds,                 // seed vector
                        nullptr,               // sample_id_probability
                        edge_type,
                        num_hops + 1,
544
545
                        expand_factor,
                        add_self_loop);
Da Zheng's avatar
Da Zheng committed
546
547
}

548
namespace {
549
  void ConstructLayers(const dgl_id_t *indptr,
550
                       const dgl_id_t *indices,
551
552
                       const std::vector<dgl_id_t>& seed_array,
                       IdArray layer_sizes,
553
554
555
556
557
558
559
560
561
                       std::vector<dgl_id_t> *layer_offsets,
                       std::vector<dgl_id_t> *node_mapping,
                       std::vector<int64_t> *actl_layer_sizes,
                       std::vector<float> *probabilities) {
    /*
     * Given a graph and a collection of seed nodes, this function constructs NodeFlow
     * layers via uniform layer-wise sampling, and return the resultant layers and their
     * corresponding probabilities.
     */
562
    std::copy(seed_array.begin(), seed_array.end(), std::back_inserter(*node_mapping));
563
564
    actl_layer_sizes->push_back(node_mapping->size());
    probabilities->insert(probabilities->end(), node_mapping->size(), 1);
565
566
    const int64_t* layer_sizes_data = static_cast<int64_t*>(layer_sizes->data);
    const int64_t num_layers = layer_sizes->shape[0];
567
568
569

    size_t curr = 0;
    size_t next = node_mapping->size();
570
    unsigned int rand_seed = randseed();
571
572
    for (int64_t i = num_layers - 1; i >= 0; --i) {
      const int64_t layer_size = layer_sizes_data[i];
573
574
575
576
577
578
579
580
581
582
583
584
      std::unordered_set<dgl_id_t> candidate_set;
      for (auto j = curr; j != next; ++j) {
        auto src = (*node_mapping)[j];
        candidate_set.insert(indices + indptr[src], indices + indptr[src + 1]);
      }

      std::vector<dgl_id_t> candidate_vector;
      std::copy(candidate_set.begin(), candidate_set.end(),
                std::back_inserter(candidate_vector));

      std::unordered_map<dgl_id_t, size_t> n_occurrences;
      auto n_candidates = candidate_vector.size();
585
      for (int64_t j = 0; j != layer_size; ++j) {
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
        auto dst = candidate_vector[rand_r(&rand_seed) % n_candidates];
        if (!n_occurrences.insert(std::make_pair(dst, 1)).second) {
          ++n_occurrences[dst];
        }
      }

      for (auto const &pair : n_occurrences) {
        node_mapping->push_back(pair.first);
        float p = pair.second * n_candidates / static_cast<float>(layer_size);
        probabilities->push_back(p);
      }

      actl_layer_sizes->push_back(node_mapping->size() - next);
      curr = next;
      next = node_mapping->size();
    }
    std::reverse(node_mapping->begin(), node_mapping->end());
    std::reverse(actl_layer_sizes->begin(), actl_layer_sizes->end());
    layer_offsets->push_back(0);
    for (const auto &size : *actl_layer_sizes) {
      layer_offsets->push_back(size + layer_offsets->back());
    }
  }

610
  void ConstructFlows(const dgl_id_t *indptr,
611
612
613
614
                      const dgl_id_t *indices,
                      const dgl_id_t *eids,
                      const std::vector<dgl_id_t> &node_mapping,
                      const std::vector<int64_t> &actl_layer_sizes,
615
616
617
                      std::vector<dgl_id_t> *sub_indptr,
                      std::vector<dgl_id_t> *sub_indices,
                      std::vector<dgl_id_t> *sub_eids,
618
619
620
621
622
623
624
                      std::vector<dgl_id_t> *flow_offsets,
                      std::vector<dgl_id_t> *edge_mapping) {
    /*
     * Given a graph and a sequence of NodeFlow layers, this function constructs dense
     * subgraphs (flows) between consecutive layers.
     */
    auto n_flows = actl_layer_sizes.size() - 1;
625
626
    for (int64_t i = 0; i < actl_layer_sizes.front() + 1; i++)
      sub_indptr->push_back(0);
627
628
629
630
631
632
633
634
635
636
637
638
639
    flow_offsets->push_back(0);
    int64_t first = 0;
    for (size_t i = 0; i < n_flows; ++i) {
      auto src_size = actl_layer_sizes[i];
      std::unordered_map<dgl_id_t, dgl_id_t> source_map;
      for (int64_t j = 0; j < src_size; ++j) {
        source_map.insert(std::make_pair(node_mapping[first + j], first + j));
      }
      auto dst_size = actl_layer_sizes[i + 1];
      for (int64_t j = 0; j < dst_size; ++j) {
        auto dst = node_mapping[first + src_size + j];
        typedef std::pair<dgl_id_t, dgl_id_t> id_pair;
        std::vector<id_pair> neighbor_indices;
640
        for (dgl_id_t k = indptr[dst]; k < indptr[dst + 1]; ++k) {
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
          // TODO(gaiyu): accelerate hash table lookup
          auto ret = source_map.find(indices[k]);
          if (ret != source_map.end()) {
            neighbor_indices.push_back(std::make_pair(ret->second, eids[k]));
          }
        }
        auto cmp = [](const id_pair p, const id_pair q)->bool { return p.first < q.first; };
        std::sort(neighbor_indices.begin(), neighbor_indices.end(), cmp);
        for (const auto &pair : neighbor_indices) {
          sub_indices->push_back(pair.first);
          edge_mapping->push_back(pair.second);
        }
        sub_indptr->push_back(sub_indices->size());
      }
      flow_offsets->push_back(sub_indices->size());
      first += src_size;
    }
    sub_eids->resize(sub_indices->size());
    std::iota(sub_eids->begin(), sub_eids->end(), 0);
  }
}  // namespace

NodeFlow SamplerOp::LayerUniformSample(const ImmutableGraph *graph,
664
                                       const std::vector<dgl_id_t>& seeds,
665
                                       const std::string &neighbor_type,
666
                                       IdArray layer_sizes) {
667
  const auto g_csr = neighbor_type == "in" ? graph->GetInCSR() : graph->GetOutCSR();
668
669
670
  const dgl_id_t *indptr = static_cast<dgl_id_t*>(g_csr->indptr()->data);
  const dgl_id_t *indices = static_cast<dgl_id_t*>(g_csr->indices()->data);
  const dgl_id_t *eids = static_cast<dgl_id_t*>(g_csr->edge_ids()->data);
671
672
673
674
675
676
677

  std::vector<dgl_id_t> layer_offsets;
  std::vector<dgl_id_t> node_mapping;
  std::vector<int64_t> actl_layer_sizes;
  std::vector<float> probabilities;
  ConstructLayers(indptr,
                  indices,
678
                  seeds,
679
680
681
682
683
684
                  layer_sizes,
                  &layer_offsets,
                  &node_mapping,
                  &actl_layer_sizes,
                  &probabilities);

685
  std::vector<dgl_id_t> sub_indptr, sub_indices, sub_edge_ids;
686
687
688
689
690
691
692
  std::vector<dgl_id_t> flow_offsets;
  std::vector<dgl_id_t> edge_mapping;
  ConstructFlows(indptr,
                 indices,
                 eids,
                 node_mapping,
                 actl_layer_sizes,
693
694
695
                 &sub_indptr,
                 &sub_indices,
                 &sub_edge_ids,
696
697
                 &flow_offsets,
                 &edge_mapping);
698
699
700
701
702
  // sanity check
  CHECK_GT(sub_indptr.size(), 0);
  CHECK_EQ(sub_indptr[0], 0);
  CHECK_EQ(sub_indptr.back(), sub_indices.size());
  CHECK_EQ(sub_indices.size(), sub_edge_ids.size());
703

704
705
706
707
708
709
  NodeFlow nf;
  auto sub_csr = CSRPtr(new CSR(
        VecToIdArray(sub_indptr), VecToIdArray(sub_indices), VecToIdArray(sub_edge_ids)));

  if (neighbor_type == std::string("in")) {
    nf.graph = GraphPtr(new ImmutableGraph(sub_csr, nullptr));
710
  } else {
711
    nf.graph = GraphPtr(new ImmutableGraph(nullptr, sub_csr));
712
713
  }

714
715
716
717
  nf.node_mapping = VecToIdArray(node_mapping);
  nf.edge_mapping = VecToIdArray(edge_mapping);
  nf.layer_offsets = VecToIdArray(layer_offsets);
  nf.flow_offsets = VecToIdArray(flow_offsets);
718
719
720
721

  return nf;
}

Da Zheng's avatar
Da Zheng committed
722
723
724
725
726
727
728
729
730
731
732
733
void BuildCsr(const ImmutableGraph &g, const std::string neigh_type) {
  if (neigh_type == "in") {
    auto csr = g.GetInCSR();
    assert(csr);
  } else if (neigh_type == "out") {
    auto csr = g.GetOutCSR();
    assert(csr);
  } else {
    LOG(FATAL) << "We don't support sample from neighbor type " << neigh_type;
  }
}

734
735
736
737
DGL_REGISTER_GLOBAL("sampling._CAPI_UniformSampling")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    // arguments
    const GraphHandle ghdl = args[0];
738
    const IdArray seed_nodes = args[1];
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
    const int64_t batch_start_id = args[2];
    const int64_t batch_size = args[3];
    const int64_t max_num_workers = args[4];
    const int64_t expand_factor = args[5];
    const int64_t num_hops = args[6];
    const std::string neigh_type = args[7];
    const bool add_self_loop = args[8];
    // process args
    const GraphInterface *ptr = static_cast<const GraphInterface *>(ghdl);
    const ImmutableGraph *gptr = dynamic_cast<const ImmutableGraph*>(ptr);
    CHECK(gptr) << "sampling isn't implemented in mutable graph";
    CHECK(IsValidIdArray(seed_nodes));
    const dgl_id_t* seed_nodes_data = static_cast<dgl_id_t*>(seed_nodes->data);
    const int64_t num_seeds = seed_nodes->shape[0];
    const int64_t num_workers = std::min(max_num_workers,
        (num_seeds + batch_size - 1) / batch_size - batch_start_id);
Da Zheng's avatar
Da Zheng committed
755
756
    // We need to make sure we have the right CSR before we enter parallel sampling.
    BuildCsr(*gptr, neigh_type);
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
    // generate node flows
    std::vector<NodeFlow*> nflows(num_workers);
#pragma omp parallel for
    for (int i = 0; i < num_workers; i++) {
      // create per-worker seed nodes.
      const int64_t start = (batch_start_id + i) * batch_size;
      const int64_t end = std::min(start + batch_size, num_seeds);
      // TODO(minjie): the vector allocation/copy is unnecessary
      std::vector<dgl_id_t> worker_seeds(end - start);
      std::copy(seed_nodes_data + start, seed_nodes_data + end,
                worker_seeds.begin());
      nflows[i] = new NodeFlow();
      *nflows[i] = SamplerOp::NeighborUniformSample(
          gptr, worker_seeds, neigh_type, num_hops, expand_factor, add_self_loop);
    }
    *rv = WrapVectorReturn(nflows);
  });

DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    // arguments
    const GraphHandle ghdl = args[0];
779
    const IdArray seed_nodes = args[1];
780
781
782
    const int64_t batch_start_id = args[2];
    const int64_t batch_size = args[3];
    const int64_t max_num_workers = args[4];
783
    const IdArray layer_sizes = args[5];
784
785
786
787
788
789
790
791
792
793
    const std::string neigh_type = args[6];
    // process args
    const GraphInterface *ptr = static_cast<const GraphInterface *>(ghdl);
    const ImmutableGraph *gptr = dynamic_cast<const ImmutableGraph*>(ptr);
    CHECK(gptr) << "sampling isn't implemented in mutable graph";
    CHECK(IsValidIdArray(seed_nodes));
    const dgl_id_t* seed_nodes_data = static_cast<dgl_id_t*>(seed_nodes->data);
    const int64_t num_seeds = seed_nodes->shape[0];
    const int64_t num_workers = std::min(max_num_workers,
        (num_seeds + batch_size - 1) / batch_size - batch_start_id);
Da Zheng's avatar
Da Zheng committed
794
795
    // We need to make sure we have the right CSR before we enter parallel sampling.
    BuildCsr(*gptr, neigh_type);
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
    // generate node flows
    std::vector<NodeFlow*> nflows(num_workers);
#pragma omp parallel for
    for (int i = 0; i < num_workers; i++) {
      // create per-worker seed nodes.
      const int64_t start = (batch_start_id + i) * batch_size;
      const int64_t end = std::min(start + batch_size, num_seeds);
      // TODO(minjie): the vector allocation/copy is unnecessary
      std::vector<dgl_id_t> worker_seeds(end - start);
      std::copy(seed_nodes_data + start, seed_nodes_data + end,
                worker_seeds.begin());
      nflows[i] = new NodeFlow();
      *nflows[i] = SamplerOp::LayerUniformSample(
          gptr, worker_seeds, neigh_type, layer_sizes);
    }
    *rv = WrapVectorReturn(nflows);
  });

Da Zheng's avatar
Da Zheng committed
814
}  // namespace dgl