edge_coarsening_impl.hip 8.78 KB
Newer Older
sangwzh's avatar
sangwzh committed
1
2
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
3
/**
4
 *  Copyright (c) 2019 by Contributors
5
6
 * @file geometry/cuda/edge_coarsening_impl.cu
 * @brief Edge coarsening CUDA implementation
7
 */
sangwzh's avatar
sangwzh committed
8
#include <hiprand/hiprand_kernel.h>
9
10
11
#include <dgl/array.h>
#include <dgl/random.h>
#include <dmlc/thread_local.h>
12

13
#include <cstdint>
14

15
#include "../../array/cuda/utils.h"
16
17
#include "../../runtime/cuda/cuda_common.h"
#include "../geometry_op.h"
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

#define BLOCKS(N, T) (N + T - 1) / T

namespace dgl {
namespace geometry {
namespace impl {

constexpr float BLUE_P = 0.53406;
constexpr int BLUE = -1;
constexpr int RED = -2;
constexpr int EMPTY_IDX = -1;

__device__ bool done_d;
__global__ void init_done_kernel() { done_d = true; }

33
34
__global__ void generate_uniform_kernel(
    float *ret_values, size_t num, uint64_t seed) {
35
36
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id < num) {
sangwzh's avatar
sangwzh committed
37
38
39
    hiprandState_t state;
    hiprand_init(seed, id, 0, &state);
    ret_values[id] = hiprand_uniform(&state);
40
41
42
  }
}

43
template <typename IdType>
44
45
__global__ void colorize_kernel(
    const float *prop, int64_t num_elem, IdType *result) {
46
47
48
49
50
51
52
53
54
55
  const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_elem) {
    if (result[idx] < 0) {  // if unmatched
      result[idx] = (prop[idx] > BLUE_P) ? RED : BLUE;
      done_d = false;
    }
  }
}

template <typename FloatType, typename IdType>
56
57
58
__global__ void weighted_propose_kernel(
    const IdType *indptr, const IdType *indices, const FloatType *weights,
    int64_t num_elem, IdType *proposal, IdType *result) {
59
60
61
62
63
64
65
66
67
68
69
  const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_elem) {
    if (result[idx] != BLUE) return;

    bool has_unmatched_neighbor = false;
    FloatType weight_max = 0.;
    IdType v_max = EMPTY_IDX;

    for (IdType i = indptr[idx]; i < indptr[idx + 1]; ++i) {
      auto v = indices[i];

70
      if (result[v] < 0) has_unmatched_neighbor = true;
71
72
73
74
75
76
77
      if (result[v] == RED && weights[i] >= weight_max) {
        v_max = v;
        weight_max = weights[i];
      }
    }

    proposal[idx] = v_max;
78
    if (!has_unmatched_neighbor) result[idx] = idx;
79
80
81
82
  }
}

template <typename FloatType, typename IdType>
83
84
85
__global__ void weighted_respond_kernel(
    const IdType *indptr, const IdType *indices, const FloatType *weights,
    int64_t num_elem, IdType *proposal, IdType *result) {
86
87
88
89
90
91
92
93
94
95
96
97
98
99
  const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_elem) {
    if (result[idx] != RED) return;

    bool has_unmatched_neighbors = false;
    IdType v_max = -1;
    FloatType weight_max = 0.;

    for (IdType i = indptr[idx]; i < indptr[idx + 1]; ++i) {
      auto v = indices[i];

      if (result[v] < 0) {
        has_unmatched_neighbors = true;
      }
100
      if (result[v] == BLUE && proposal[v] == idx && weights[i] >= weight_max) {
101
102
103
104
105
106
107
108
109
        v_max = v;
        weight_max = weights[i];
      }
    }
    if (v_max >= 0) {
      result[v_max] = min(idx, v_max);
      result[idx] = min(idx, v_max);
    }

110
    if (!has_unmatched_neighbors) result[idx] = idx;
111
112
113
  }
}

114
/** @brief The colorize procedure. This procedure randomly marks unmarked
115
116
117
 * nodes with BLUE(-1) and RED(-2) and checks whether the node matching
 * process has finished.
 */
118
119
template <typename IdType>
bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) {
120
  // initial done signal
sangwzh's avatar
sangwzh committed
121
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
122
  CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream);
123
124

  // generate color prop for each node
125
126
127
  uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX);
  auto num_threads = cuda::FindNumThreads(num_nodes);
  auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_nodes, num_threads));
128
129
130
  CUDA_KERNEL_CALL(
      generate_uniform_kernel, num_blocks, num_threads, 0, stream, prop,
      num_nodes, seed);
131
132

  // call kernel
133
134
135
  CUDA_KERNEL_CALL(
      colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes,
      result_data);
136
  bool done_h = false;
sangwzh's avatar
sangwzh committed
137
138
  CUDA_CALL(hipMemcpyFromSymbol(
      &done_h, done_d, sizeof(done_h), 0, hipMemcpyDeviceToHost));
139
140
141
  return done_h;
}

142
/** @brief Weighted neighbor matching procedure (GPU version).
143
144
 * This implementation is from `A GPU Algorithm for Greedy Graph Matching
 * <http://www.staff.science.uu.nl/~bisse101/Articles/match12.pdf>`__
145
 *
146
147
148
149
150
151
152
153
154
155
156
 * This algorithm has three parts: colorize, propose and respond.
 * In colorize procedure, each unmarked node will be marked as BLUE or
 * RED randomly. If all nodes are marked, finish and return.
 * In propose procedure, each BLUE node will propose to the RED
 * neighbor with the largest weight (or randomly choose one if without weight).
 * If all its neighbors are marked, mark this node with its id.
 * In respond procedure, each RED node will respond to the BLUE neighbor
 * that has proposed to it and has the largest weight. If all neighbors
 * are marked, mark this node with its id. Else match this (BLUE, RED) node
 * pair and mark them with the smaller id between them.
 */
157
template <DGLDeviceType XPU, typename FloatType, typename IdType>
158
159
void WeightedNeighborMatching(
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
sangwzh's avatar
sangwzh committed
160
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
161
  const auto &ctx = result->ctx;
162
163
  auto device = runtime::DeviceAPI::Get(ctx);
  device->SetDevice(ctx);
164
165
166

  // create proposal tensor
  const int64_t num_nodes = result->shape[0];
167
  IdArray proposal = aten::Full(-1, num_nodes, sizeof(IdType) * 8, ctx);
168
169

  // get data ptrs
170
171
172
173
174
  IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
  IdType *indices_data = static_cast<IdType *>(csr.indices->data);
  IdType *result_data = static_cast<IdType *>(result->data);
  IdType *proposal_data = static_cast<IdType *>(proposal->data);
  FloatType *weight_data = static_cast<FloatType *>(weight->data);
175

176
  // allocate workspace for prop used in Colorize()
177
  float *prop = static_cast<float *>(
178
179
      device->AllocWorkspace(ctx, num_nodes * sizeof(float)));

180
181
  auto num_threads = cuda::FindNumThreads(num_nodes);
  auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_nodes, num_threads));
182
  while (!Colorize<IdType>(result_data, num_nodes, prop)) {
183
184
185
186
187
188
189
190
    CUDA_KERNEL_CALL(
        weighted_propose_kernel, num_blocks, num_threads, 0, stream,
        indptr_data, indices_data, weight_data, num_nodes, proposal_data,
        result_data);
    CUDA_KERNEL_CALL(
        weighted_respond_kernel, num_blocks, num_threads, 0, stream,
        indptr_data, indices_data, weight_data, num_nodes, proposal_data,
        result_data);
191
  }
192
  device->FreeWorkspace(ctx, prop);
193
}
194
template void WeightedNeighborMatching<kDGLCUDA, float, int32_t>(
195
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
196
template void WeightedNeighborMatching<kDGLCUDA, float, int64_t>(
197
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
198
template void WeightedNeighborMatching<kDGLCUDA, double, int32_t>(
199
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
200
template void WeightedNeighborMatching<kDGLCUDA, double, int64_t>(
201
    const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
202

203
/** @brief Unweighted neighbor matching procedure (GPU version).
204
205
206
207
208
209
210
211
212
 * Instead of directly sample neighbors, we assign each neighbor
 * with a random weight. We use random weight for 2 reasons:
 *  1. Random sample for each node in GPU is expensive. Although
 *     we can perform a global group-wise (neighborhood of each
 *     node as a group) random permutation as in CPU version,
 *     it still cost too much compared to directly using random weights.
 *  2. Graph is sparse, thus neighborhood of each node is small,
 *     which is suitable for GPU implementation.
 */
213
template <DGLDeviceType XPU, typename IdType>
214
215
void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
  const int64_t num_edges = csr.indices->shape[0];
216
  const auto &ctx = result->ctx;
217
218
  auto device = runtime::DeviceAPI::Get(ctx);
  device->SetDevice(ctx);
219
220

  // generate random weights
sangwzh's avatar
sangwzh committed
221
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
222
  NDArray weight = NDArray::Empty(
223
224
      {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx);
  float *weight_data = static_cast<float *>(weight->data);
225
226
227
  uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX);
  auto num_threads = cuda::FindNumThreads(num_edges);
  auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_edges, num_threads));
228
229
230
  CUDA_KERNEL_CALL(
      generate_uniform_kernel, num_blocks, num_threads, 0, stream, weight_data,
      num_edges, seed);
231
232
233

  WeightedNeighborMatching<XPU, float, IdType>(csr, weight, result);
}
234
235
236
237
template void NeighborMatching<kDGLCUDA, int32_t>(
    const aten::CSRMatrix &csr, IdArray result);
template void NeighborMatching<kDGLCUDA, int64_t>(
    const aten::CSRMatrix &csr, IdArray result);
238
239
240
241

}  // namespace impl
}  // namespace geometry
}  // namespace dgl