// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/csr2coo.cc
 * @brief CSR2COO
 */
#include <dgl/array.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <hipcub/backend/rocprim/device/device_copy.hpp>

#include <hipcub/hipcub.hpp>

#include <algorithm>
#include <limits>

#include "../../runtime/cuda/cuda_common.h"
#include "utils.h"

namespace dgl {

using runtime::NDArray;

namespace aten {
namespace impl {

template <DGLDeviceType XPU, typename IdType>
COOMatrix CSRToCOO(CSRMatrix csr) {
  LOG(FATAL) << "Unreachable code.";
  return {};
}

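// int32: hipSPARSE provides csr2coo directly, which expands the row
// offsets (indptr) into one row index per nonzero.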
template <>
COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));

  NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
  const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
  NDArray row =
      aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
  int32_t* row_ptr = static_cast<int32_t*>(row->data);

  CUSPARSE_CALL(hipsparseXcsr2coo(
      thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows,
      row_ptr, HIPSPARSE_INDEX_BASE_ZERO));

  return COOMatrix(
      csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted);
}

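// Helper functors for the batched-copy expansion below: for row i,
// RepeatIndex yields a constant iterator repeating i, OutputBufferIndexer
// locates the row's output segment at buffer + indptr[i], and
// AdjacentDifference computes the segment length indptr[i + 1] - indptr[i].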
struct RepeatIndex {
  template <typename IdType>
  __host__ __device__ auto operator()(IdType i) {
    return thrust::make_constant_iterator(i);
  }
};

template <typename IdType>
struct OutputBufferIndexer {
  const IdType* indptr;
  IdType* buffer;
  __host__ __device__ auto operator()(IdType i) { return buffer + indptr[i]; }
};

template <typename IdType>
struct AdjacentDifference {
  const IdType* indptr;
  __host__ __device__ auto operator()(IdType i) {
    return indptr[i + 1] - indptr[i];
  }
};

/*!
 * \brief Repeat elements
 * \param val Values to repeat.
 * \param pos Exclusive prefix sum of the number of repeats per value, i.e.
 *        pos[i] is where the repeats of val[i] start in the output buffer.
 * \param out Output buffer.
 * \param n_row Number of values (length of val and pos).
 * \param length Length of the output buffer (total number of repeats).
 *
 * For example:
 * val = [3, 0, 1]
 * repeats = [1, 0, 2]
 * pos = [0, 1, 1]  # exclusive prefix sum of repeats
 * then,
 * out = [3, 1, 1]
 */
template <typename DType, typename IdType>
__global__ void _RepeatKernel(
    const DType* val, const IdType* pos,
    DType* out, int64_t n_row, int64_t length) {
  IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
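    // Find the row i whose range [pos[i], pos[i + 1]) contains index tx.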
    IdType i = dgl::cuda::_UpperBound(pos, n_row, tx) - 1;
    out[tx] = val[i];
    tx += stride_x;
  }
}


#if 1
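// int64: hipsparseXcsr2coo only handles 32-bit indices, so expand indptr
// with a batched device copy (one copy per row) instead.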
template <>
COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
  const auto& ctx = csr.indptr->ctx;
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const int64_t nnz = csr.indices->shape[0];
  const auto nbits = csr.indptr->dtype.bits;
  IdArray ret_row = NewIdArray(nnz, ctx, nbits);

  runtime::CUDAWorkspaceAllocator allocator(csr.indptr->ctx);
  thrust::counting_iterator<int64_t> iota(0);

  auto input_buffer = thrust::make_transform_iterator(iota, RepeatIndex{});
  auto output_buffer = thrust::make_transform_iterator(
      iota, OutputBufferIndexer<int64_t>{
                csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>()});
  auto buffer_sizes = thrust::make_transform_iterator(
      iota, AdjacentDifference<int64_t>{csr.indptr.Ptr<int64_t>()});

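  // The batched copy takes the number of buffers as a 32-bit count, so
  // process the rows in chunks of at most INT32_MAX.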
  constexpr int64_t max_copy_at_once = std::numeric_limits<int32_t>::max();
  for (int64_t i = 0; i < csr.num_rows; i += max_copy_at_once) {
    std::size_t temp_storage_bytes = 0;
    CUDA_CALL(hipcub::DeviceCopy::Batched(
        nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i,
        buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once),
        stream));

    auto temp = allocator.alloc_unique<char>(temp_storage_bytes);

    CUDA_CALL(hipcub::DeviceCopy::Batched(
        temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i,
        buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once),
        stream));
  }

  return COOMatrix(
      csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
      csr.sorted);
}
#else
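// Fallback implementation: one thread per output element binary-searches
// indptr (via _UpperBound) to find its row index.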
template <>
COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
  const auto& ctx = csr.indptr->ctx;
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const int64_t nnz = csr.indices->shape[0];
  const auto nbits = csr.indptr->dtype.bits;
  IdArray rowids = Range(0, csr.num_rows, nbits, ctx);
  IdArray ret_row = NewIdArray(nnz, ctx, nbits);

  const int nt = 256;
  const int nb = (nnz + nt - 1) / nt;
  CUDA_KERNEL_CALL(_RepeatKernel,
      nb, nt, 0, stream,
      rowids.Ptr<int64_t>(),
      csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
      csr.num_rows, nnz);

  return COOMatrix(csr.num_rows, csr.num_cols,
                   ret_row, csr.indices, csr.data,
                   true, csr.sorted);
}
#endif

template COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr);
template COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr);

template <DGLDeviceType XPU, typename IdType>
COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) {
  LOG(FATAL) << "Unreachable code.";
  return {};
}

template <>
COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
  COOMatrix coo = CSRToCOO<kDGLCUDA, int32_t>(csr);
  if (aten::IsNullArray(coo.data)) return coo;

  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto device = runtime::DeviceAPI::Get(coo.row->ctx);
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));

  NDArray row = coo.row, col = coo.col, data = coo.data;
  int32_t* row_ptr = static_cast<int32_t*>(row->data);
  int32_t* col_ptr = static_cast<int32_t*>(col->data);
  int32_t* data_ptr = static_cast<int32_t*>(data->data);

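  // coo.data holds the entry ids of the nonzeros (a permutation of
  // [0, nnz)), so sorting with data as the keys reorders everything into
  // data order: row rides along as the "column" array and col as the
  // permutation vector P of coosortByRow.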
  size_t workspace_size = 0;
  CUSPARSE_CALL(hipsparseXcoosort_bufferSizeExt(
      thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
      data_ptr, row_ptr, &workspace_size));
  void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
  CUSPARSE_CALL(hipsparseXcoosortByRow(
      thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
      data_ptr, row_ptr, col_ptr, workspace));
  device->FreeWorkspace(row->ctx, workspace);

  // The row and column fields have already been reordered according
  // to data, so the data field is no longer needed.
  coo.data = aten::NullArray();
  coo.row_sorted = false;
  coo.col_sorted = false;
  return coo;
}

template <>
COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int64_t>(CSRMatrix csr) {
  COOMatrix coo = CSRToCOO<kDGLCUDA, int64_t>(csr);
  if (aten::IsNullArray(coo.data)) return coo;
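  // Sort the entry ids; sorted.second maps each position in sorted order
  // back to its original index, so IndexSelect reorders row and col into
  // data order.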
  const auto& sorted = Sort(coo.data);

  coo.row = IndexSelect(coo.row, sorted.second);
  coo.col = IndexSelect(coo.col, sorted.second);

  // The row and column fields have already been reordered according
  // to data, so the data field is no longer needed.
  coo.data = aten::NullArray();
  coo.row_sorted = false;
  coo.col_sorted = false;
  return coo;
}

template COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr);
template COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int64_t>(CSRMatrix csr);

}  // namespace impl
}  // namespace aten
}  // namespace dgl