/*!
 *  Copyright (c) 2021 by Contributors.
 * @file array/cuda/spmat_op_impl_coo.cu
 * @brief COO operator GPU implementation
 */
#include <dgl/array.h>

#include <numeric>
#include <unordered_set>
#include <vector>

#include "../../runtime/cuda/cuda_common.h"
#include "./atomic.cuh"
#include "./utils.h"

namespace dgl {

using runtime::NDArray;
using namespace cuda;

namespace aten {
namespace impl {

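/**
 * @brief Final warp-level steps of the shared-memory tree reduction used by
 *        _COOGetRowNNZKernel. Called by the first 32 threads of a block; it
 *        folds sdata[0..63] down into sdata[0]. Assumes blockDim.x >= 64.
 */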
template <typename IdType>
__device__ void _warpReduce(volatile IdType* sdata, IdType tid) {
  // The volatile qualifier keeps shared-memory accesses out of registers;
  // __syncwarp() separates the read and the write of each step, since warp
  // lanes are not guaranteed to execute in lockstep on Volta and later GPUs.
  IdType v = sdata[tid];
#pragma unroll
  for (int offset = 32; offset > 0; offset >>= 1) {
    v += sdata[tid + offset];
    __syncwarp();
    sdata[tid] = v;
    __syncwarp();
  }
}

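/**
 * @brief Count the entries of @a row_indices equal to @a row_query and
 *        atomically accumulate the total into the single-element device
 *        counter @a glb_cnt.
 */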
template <typename IdType>
__global__ void _COOGetRowNNZKernel(
    const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnt,
    const int64_t row_query, IdType nnz) {
  __shared__ IdType local_cnt[1024];
  IdType tx = threadIdx.x;
  IdType bx = blockIdx.x;
  IdType start = bx * blockDim.x;
  // Grid-stride loop: each block scans successive blockDim.x-sized chunks
  // of the row array.
  while (start < nnz) {
    // Reset every slot on each pass so that threads past the end of the
    // array contribute zero rather than a stale partial sum from the
    // previous chunk.
    local_cnt[tx] = (start + tx < nnz)
                        ? IdType(row_indices[start + tx] == row_query)
                        : IdType(0);
    __syncthreads();
    // Block-level tree reduction. Every thread in the block must reach each
    // barrier, so the barriers sit outside the guarded additions.
    if (tx < 512) local_cnt[tx] += local_cnt[tx + 512];
    __syncthreads();
    if (tx < 256) local_cnt[tx] += local_cnt[tx + 256];
    __syncthreads();
    if (tx < 128) local_cnt[tx] += local_cnt[tx + 128];
    __syncthreads();
    if (tx < 64) local_cnt[tx] += local_cnt[tx + 64];
    __syncthreads();
    if (tx < 32) _warpReduce(local_cnt, tx);
    // Thread 0 folds this block's chunk count into the global counter.
    if (tx == 0) cuda::AtomicAdd(glb_cnt, local_cnt[0]);
    // Ensure local_cnt[0] has been consumed before the next pass
    // overwrites shared memory.
    __syncthreads();
    start += blockDim.x * gridDim.x;
  }
}

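/**
 * @brief Return the number of non-zero entries in the given row of a COO
 *        matrix. Launches _COOGetRowNNZKernel and copies the scalar result
 *        back to the host.
 */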
template <DGLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  const auto& ctx = coo.row->ctx;
  const IdType nnz = coo.row->shape[0];
  const IdType nt = 1024;
  const IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
  NDArray rst = NDArray::Empty({1}, coo.row->dtype, ctx);
  _Fill(rst.Ptr<IdType>(), 1, IdType(0));
  CUDA_KERNEL_CALL(
      _COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
      rst.Ptr<IdType>(), row, nnz);
  rst = rst.CopyTo(DGLContext{kDGLCPU, 0});
  return *rst.Ptr<IdType>();
}

template int64_t COOGetRowNNZ<kDGLCUDA, int32_t>(COOMatrix, int64_t);
template int64_t COOGetRowNNZ<kDGLCUDA, int64_t>(COOMatrix, int64_t);

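/**
 * @brief Histogram the row indices: for every non-zero entry, atomically
 *        increment the counter of its row in @a glb_cnts.
 */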
template <typename IdType>
__global__ void _COOGetAllRowNNZKernel(
    const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnts,
    IdType nnz) {
  // Grid-stride loop over all non-zero entries; one atomic add per entry.
  IdType eid = blockIdx.x * blockDim.x + threadIdx.x;
  while (eid < nnz) {
    IdType row = row_indices[eid];
    cuda::AtomicAdd(glb_cnts + row, IdType(1));
    eid += blockDim.x * gridDim.x;
  }
}

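/**
 * @brief Return the number of non-zero entries for each queried row. For a
 *        single queried row this reduces over the row array directly; for
 *        multiple rows it builds the full per-row histogram once and then
 *        gathers the requested entries.
 */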
template <DGLDeviceType XPU, typename IdType>
NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  const auto& ctx = coo.row->ctx;
  const IdType nnz = coo.row->shape[0];
  const IdType num_rows = coo.num_rows;
  const IdType num_queries = rows->shape[0];
  const IdType nt = 1024;
  const IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
  if (num_queries == 1) {
    // Single query: count matches for that one row, as in the scalar
    // overload, but return the result as a one-element GPU array.
    auto rows_cpu = rows.CopyTo(DGLContext{kDGLCPU, 0});
    const int64_t row = *rows_cpu.Ptr<IdType>();
    NDArray rst = NDArray::Empty({1}, coo.row->dtype, ctx);
    _Fill(rst.Ptr<IdType>(), 1, IdType(0));
    CUDA_KERNEL_CALL(
        _COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
        rst.Ptr<IdType>(), row, nnz);
    return rst;
  } else {
    // Multiple queries: compute the in-degree of every row in one pass,
    // then gather the queried rows.
    NDArray in_degrees = NDArray::Empty({num_rows}, rows->dtype, rows->ctx);
    _Fill(in_degrees.Ptr<IdType>(), num_rows, IdType(0));
    CUDA_KERNEL_CALL(
        _COOGetAllRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
        in_degrees.Ptr<IdType>(), nnz);
    return IndexSelect(in_degrees, rows);
  }
}

template NDArray COOGetRowNNZ<kDGLCUDA, int32_t>(COOMatrix, NDArray);
template NDArray COOGetRowNNZ<kDGLCUDA, int64_t>(COOMatrix, NDArray);

}  // namespace impl
}  // namespace aten
}  // namespace dgl