edge_coarsening_impl.cu 8.64 KB
Newer Older
1
/**
2
 *  Copyright (c) 2019 by Contributors
3
4
 * @file geometry/cuda/edge_coarsening_impl.cu
 * @brief Edge coarsening CUDA implementation
5
 */
6
#include <curand_kernel.h>
7
8
9
#include <dgl/array.h>
#include <dgl/random.h>
#include <dmlc/thread_local.h>
10

11
#include <cstdint>
12

13
#include "../../array/cuda/utils.h"
14
15
#include "../../runtime/cuda/cuda_common.h"
#include "../geometry_op.h"
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30

#define BLOCKS(N, T) (N + T - 1) / T

namespace dgl {
namespace geometry {
namespace impl {

constexpr float BLUE_P = 0.53406;
constexpr int BLUE = -1;
constexpr int RED = -2;
constexpr int EMPTY_IDX = -1;

__device__ bool done_d;
__global__ void init_done_kernel() { done_d = true; }

31
32
__global__ void generate_uniform_kernel(
    float *ret_values, size_t num, uint64_t seed) {
33
34
35
36
37
38
39
40
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id < num) {
    curandState state;
    curand_init(seed, id, 0, &state);
    ret_values[id] = curand_uniform(&state);
  }
}

41
template <typename IdType>
42
43
__global__ void colorize_kernel(
    const float *prop, int64_t num_elem, IdType *result) {
44
45
46
47
48
49
50
51
52
53
  const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_elem) {
    if (result[idx] < 0) {  // if unmatched
      result[idx] = (prop[idx] > BLUE_P) ? RED : BLUE;
      done_d = false;
    }
  }
}

template <typename FloatType, typename IdType>
54
55
56
__global__ void weighted_propose_kernel(
    const IdType *indptr, const IdType *indices, const FloatType *weights,
    int64_t num_elem, IdType *proposal, IdType *result) {
57
58
59
60
61
62
63
64
65
66
67
  const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_elem) {
    if (result[idx] != BLUE) return;

    bool has_unmatched_neighbor = false;
    FloatType weight_max = 0.;
    IdType v_max = EMPTY_IDX;

    for (IdType i = indptr[idx]; i < indptr[idx + 1]; ++i) {
      auto v = indices[i];

68
      if (result[v] < 0) has_unmatched_neighbor = true;
69
70
71
72
73
74
75
      if (result[v] == RED && weights[i] >= weight_max) {
        v_max = v;
        weight_max = weights[i];
      }
    }

    proposal[idx] = v_max;
76
    if (!has_unmatched_neighbor) result[idx] = idx;
77
78
79
80
  }
}

template <typename FloatType, typename IdType>
81
82
83
__global__ void weighted_respond_kernel(
    const IdType *indptr, const IdType *indices, const FloatType *weights,
    int64_t num_elem, IdType *proposal, IdType *result) {
84
85
86
87
88
89
90
91
92
93
94
95
96
97
  const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_elem) {
    if (result[idx] != RED) return;

    bool has_unmatched_neighbors = false;
    IdType v_max = -1;
    FloatType weight_max = 0.;

    for (IdType i = indptr[idx]; i < indptr[idx + 1]; ++i) {
      auto v = indices[i];

      if (result[v] < 0) {
        has_unmatched_neighbors = true;
      }
98
      if (result[v] == BLUE && proposal[v] == idx && weights[i] >= weight_max) {
99
100
101
102
103
104
105
106
107
        v_max = v;
        weight_max = weights[i];
      }
    }
    if (v_max >= 0) {
      result[v_max] = min(idx, v_max);
      result[idx] = min(idx, v_max);
    }

108
    if (!has_unmatched_neighbors) result[idx] = idx;
109
110
111
  }
}

112
/** @brief The colorize procedure. This procedure randomly marks unmarked
113
114
115
 * nodes with BLUE(-1) and RED(-2) and checks whether the node matching
 * process has finished.
 */
116
117
template <typename IdType>
bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
118
  // initial done signal
119
120
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream);
121
122

  // generate color prop for each node
123
124
125
  uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX);
  auto num_threads = cuda::FindNumThreads(num_nodes);
  auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_nodes, num_threads));
126
127
128
  CUDA_KERNEL_CALL(
      generate_uniform_kernel, num_blocks, num_threads, 0, stream, prop,
      num_nodes, seed);
129
130

  // call kernel
131
132
133
  CUDA_KERNEL_CALL(
      colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes,
      result_data);
134
  bool done_h = false;
135
136
  CUDA_CALL(cudaMemcpyFromSymbol(
      &done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost));
137
138
139
  return done_h;
}

140
/** @brief Weighted neighbor matching procedure (GPU version).
141
142
 * This implementation is from `A GPU Algorithm for Greedy Graph Matching
 * <http://www.staff.science.uu.nl/~bisse101/Articles/match12.pdf>`__
143
 *
144
145
146
147
148
149
150
151
152
153
154
 * This algorithm has three parts: colorize, propose and respond.
 * In colorize procedure, each unmarked node will be marked as BLUE or
 * RED randomly. If all nodes are marked, finish and return.
 * In propose procedure, each BLUE node will propose to the RED
 * neighbor with the largest weight (or randomly choose one if without weight).
 * If all its neighbors are marked, mark this node with its id.
 * In respond procedure, each RED node will respond to the BLUE neighbor
 * that has proposed to it and has the largest weight. If all neighbors
 * are marked, mark this node with its id. Else match this (BLUE, RED) node
 * pair and mark them with the smaller id between them.
 */
155
template <DGLDeviceType XPU, typename FloatType, typename IdType>
156
157
void WeightedNeighborMatching(
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
158
  cudaStream_t stream = runtime::getCurrentCUDAStream();
159
  const auto &ctx = result->ctx;
160
161
  auto device = runtime::DeviceAPI::Get(ctx);
  device->SetDevice(ctx);
162
163
164

  // create proposal tensor
  const int64_t num_nodes = result->shape[0];
165
  IdArray proposal = aten::Full(-1, num_nodes, sizeof(IdType) * 8, ctx);
166
167

  // get data ptrs
168
169
170
171
172
  IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
  IdType *indices_data = static_cast<IdType *>(csr.indices->data);
  IdType *result_data = static_cast<IdType *>(result->data);
  IdType *proposal_data = static_cast<IdType *>(proposal->data);
  FloatType *weight_data = static_cast<FloatType *>(weight->data);
173

174
  // allocate workspace for prop used in Colorize()
175
  float *prop = static_cast<float *>(
176
177
      device->AllocWorkspace(ctx, num_nodes * sizeof(float)));

178
179
  auto num_threads = cuda::FindNumThreads(num_nodes);
  auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_nodes, num_threads));
180
  while (!Colorize<IdType>(result_data, num_nodes, prop)) {
181
182
183
184
185
186
187
188
    CUDA_KERNEL_CALL(
        weighted_propose_kernel, num_blocks, num_threads, 0, stream,
        indptr_data, indices_data, weight_data, num_nodes, proposal_data,
        result_data);
    CUDA_KERNEL_CALL(
        weighted_respond_kernel, num_blocks, num_threads, 0, stream,
        indptr_data, indices_data, weight_data, num_nodes, proposal_data,
        result_data);
189
  }
190
  device->FreeWorkspace(ctx, prop);
191
}
192
template void WeightedNeighborMatching<kDGLCUDA, float, int32_t>(
193
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
194
template void WeightedNeighborMatching<kDGLCUDA, float, int64_t>(
195
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
196
template void WeightedNeighborMatching<kDGLCUDA, double, int32_t>(
197
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
198
template void WeightedNeighborMatching<kDGLCUDA, double, int64_t>(
199
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
200

201
/** @brief Unweighted neighbor matching procedure (GPU version).
202
203
204
205
206
207
208
209
210
 * Instead of directly sample neighbors, we assign each neighbor
 * with a random weight. We use random weight for 2 reasons:
 *  1. Random sample for each node in GPU is expensive. Although
 *     we can perform a global group-wise (neighborhood of each
 *     node as a group) random permutation as in CPU version,
 *     it still cost too much compared to directly using random weights.
 *  2. Graph is sparse, thus neighborhood of each node is small,
 *     which is suitable for GPU implementation.
 */
211
template <DGLDeviceType XPU, typename IdType>
212
213
void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
  const int64_t num_edges = csr.indices->shape[0];
214
  const auto &ctx = result->ctx;
215
216
  auto device = runtime::DeviceAPI::Get(ctx);
  device->SetDevice(ctx);
217
218

  // generate random weights
219
  cudaStream_t stream = runtime::getCurrentCUDAStream();
220
  NDArray weight = NDArray::Empty(
221
222
      {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
  float *weight_data = static_cast<float *>(weight->data);
223
224
225
  uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX);
  auto num_threads = cuda::FindNumThreads(num_edges);
  auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_edges, num_threads));
226
227
228
  CUDA_KERNEL_CALL(
      generate_uniform_kernel, num_blocks, num_threads, 0, stream, weight_data,
      num_edges, seed);
229
230
231

  WeightedNeighborMatching<XPU, float, IdType>(csr, weight, result);
}
232
233
234
235
template void NeighborMatching<kDGLCUDA, int32_t>(
    const aten::CSRMatrix &csr, IdArray result);
template void NeighborMatching<kDGLCUDA, int64_t>(
    const aten::CSRMatrix &csr, IdArray result);
236
237
238
239

}  // namespace impl
}  // namespace geometry
}  // namespace dgl