"vscode:/vscode.git/clone" did not exist on "2bababf2f285235cd113eea2462d09fa5aa845f6"
csr2coo.hip 7.63 KB
Newer Older
sangwzh's avatar
sangwzh committed
1
2
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/csr2coo.cc
 * @brief CSR2COO
 */
#include <dgl/array.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <hipcub/hipcub.hpp>

#include <algorithm>
#include <limits>

#include "../../runtime/cuda/cuda_common.h"
#include "utils.h"

namespace dgl {

using runtime::NDArray;

namespace aten {
namespace impl {

template <DGLDeviceType XPU, typename IdType>
COOMatrix CSRToCOO(CSRMatrix csr) {
  LOG(FATAL) << "Unreachable code";
  return {};
}

template <>
COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));

  NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
  const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
  NDArray row =
      aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
  int32_t* row_ptr = static_cast<int32_t*>(row->data);

  CUSPARSE_CALL(hipsparseXcsr2coo(
      thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows,
      row_ptr, HIPSPARSE_INDEX_BASE_ZERO));
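  // For illustration: hipsparseXcsr2coo expands the compressed row pointer
  // into one explicit row id per nonzero, e.g.
  //   indptr = [0, 2, 2, 3]  (num_rows = 3, nnz = 3)  ->  row = [0, 0, 2]
  // indices and data carry over into the COO result unchanged.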

  return COOMatrix(
      csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted);
}

struct RepeatIndex {
  template <typename IdType>
  __host__ __device__ auto operator()(IdType i) {
    return thrust::make_constant_iterator(i);
  }
};

template <typename IdType>
struct OutputBufferIndexer {
  const IdType* indptr;
  IdType* buffer;
  __host__ __device__ auto operator()(IdType i) { return buffer + indptr[i]; }
};

template <typename IdType>
struct AdjacentDifference {
  const IdType* indptr;
  __host__ __device__ auto operator()(IdType i) {
    return indptr[i + 1] - indptr[i];
  }
};
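// How the three functors above fit together (a reading of the batched-copy
// path below): for each row i, RepeatIndex produces an iterator that repeats
// the value i, OutputBufferIndexer aims the copy at buffer + indptr[i], and
// AdjacentDifference supplies its length indptr[i + 1] - indptr[i], i.e. the
// number of nonzeros in row i.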

/*!
 * \brief Repeat elements
 * \param val Values to repeat
 * \param pos Exclusive prefix sum of the repeat counts; pos[i] is the
 *        output position where val[i] starts
 * \param out Output buffer
 * \param n_row Number of values (length of val and pos)
 * \param length Length of the output buffer
 *
 * For example:
 * val = [3, 0, 1]
 * repeats = [1, 0, 2]  # number of repeats of each value (not passed in)
 * pos = [0, 1, 1]  # exclusive prefix sum of repeats
 * then,
 * out = [3, 1, 1]
 */
template <typename DType, typename IdType>
__global__ void _RepeatKernel(
    const DType* val, const IdType* pos,
    DType* out, int64_t n_row, int64_t length) {
  IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
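    // pos holds the row start offsets; the upper bound finds the first
    // offset greater than tx, so i is the row that owns output position tx.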
    IdType i = dgl::cuda::_UpperBound(pos, n_row, tx) - 1;
    out[tx] = val[i];
    tx += stride_x;
  }
}


#if 0
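// This hipified cub::DeviceCopy::Batched implementation is kept for
// reference but disabled, presumably because DeviceCopy::Batched is not
// available through hipCUB here; the _RepeatKernel path in the #else
// branch is used instead.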
template <>
COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
  const auto& ctx = csr.indptr->ctx;
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const int64_t nnz = csr.indices->shape[0];
  const auto nbits = csr.indptr->dtype.bits;
  IdArray ret_row = NewIdArray(nnz, ctx, nbits);

  runtime::CUDAWorkspaceAllocator allocator(csr.indptr->ctx);
  thrust::counting_iterator<int64_t> iota(0);

  auto input_buffer = thrust::make_transform_iterator(iota, RepeatIndex{});
  auto output_buffer = thrust::make_transform_iterator(
      iota, OutputBufferIndexer<int64_t>{
                csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>()});
  auto buffer_sizes = thrust::make_transform_iterator(
      iota, AdjacentDifference<int64_t>{csr.indptr.Ptr<int64_t>()});

  constexpr int64_t max_copy_at_once = std::numeric_limits<int32_t>::max();
  for (int64_t i = 0; i < csr.num_rows; i += max_copy_at_once) {
    std::size_t temp_storage_bytes = 0;
    CUDA_CALL(cub::DeviceCopy::Batched(
        nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i,
        buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once),
        stream));

    auto temp = allocator.alloc_unique<char>(temp_storage_bytes);

    CUDA_CALL(cub::DeviceCopy::Batched(
        temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i,
        buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once),
        stream));
  }

  return COOMatrix(
      csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
      csr.sorted);
}
#else
template <>
COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
  const auto& ctx = csr.indptr->ctx;
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

  const int64_t nnz = csr.indices->shape[0];
  const auto nbits = csr.indptr->dtype.bits;
  IdArray rowids = Range(0, csr.num_rows, nbits, ctx);
  IdArray ret_row = NewIdArray(nnz, ctx, nbits);

  const int nt = 256;
  const int nb = (nnz + nt - 1) / nt;
  CUDA_KERNEL_CALL(_RepeatKernel,
      nb, nt, 0, stream,
      rowids.Ptr<int64_t>(),
      csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
      csr.num_rows, nnz);

  return COOMatrix(csr.num_rows, csr.num_cols,
                   ret_row, csr.indices, csr.data,
                   true, csr.sorted);
}
#endif

template COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr);
template COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr);
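// Usage sketch (hypothetical caller; assumes the CSR arrays already live on
// a GPU context):
//   CSRMatrix csr = ...;  // indptr/indices allocated on kDGLCUDA
//   COOMatrix coo = CSRToCOO<kDGLCUDA, int64_t>(csr);
//   // coo.row holds one row id per nonzero; coo.col aliases csr.indices.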

template <DGLDeviceType XPU, typename IdType>
COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) {
  LOG(FATAL) << "Unreachable code";
  return {};
}

template <>
COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
  COOMatrix coo = CSRToCOO<kDGLCUDA, int32_t>(csr);
  if (aten::IsNullArray(coo.data)) return coo;

  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  auto device = runtime::DeviceAPI::Get(coo.row->ctx);
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
  }
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));

  NDArray row = coo.row, col = coo.col, data = coo.data;
  int32_t* row_ptr = static_cast<int32_t*>(row->data);
  int32_t* col_ptr = static_cast<int32_t*>(col->data);
  int32_t* data_ptr = static_cast<int32_t*>(data->data);

  size_t workspace_size = 0;
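  // Note: data_ptr is deliberately passed in the row-key slot of the coosort
  // calls below, so the sort key is the data array; row_ptr rides along as
  // the "column" array and col_ptr as the permutation array, leaving both
  // reordered into data order.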
  CUSPARSE_CALL(hipsparseXcoosort_bufferSizeExt(
      thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
      data_ptr, row_ptr, &workspace_size));
  void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
  CUSPARSE_CALL(hipsparseXcoosortByRow(
      thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
      data_ptr, row_ptr, col_ptr, workspace));
  device->FreeWorkspace(row->ctx, workspace);

  // The row and column fields have already been reordered according
  // to data, so the data field is no longer needed and is dropped.
  coo.data = aten::NullArray();
  coo.row_sorted = false;
  coo.col_sorted = false;
  return coo;
}

template <>
COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int64_t>(CSRMatrix csr) {
  COOMatrix coo = CSRToCOO<kDGLCUDA, int64_t>(csr);
  if (aten::IsNullArray(coo.data)) return coo;
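  // As used here, Sort returns a pair: sorted.first is coo.data in ascending
  // order and sorted.second is the permutation that achieves it; gathering
  // row and col with sorted.second reorders both into data order.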
  const auto& sorted = Sort(coo.data);

  coo.row = IndexSelect(coo.row, sorted.second);
  coo.col = IndexSelect(coo.col, sorted.second);

  // The row and column fields have already been reordered according
  // to data, so the data field is no longer needed and is dropped.
  coo.data = aten::NullArray();
  coo.row_sorted = false;
  coo.col_sorted = false;
  return coo;
}

template COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr);
template COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int64_t>(CSRMatrix csr);

}  // namespace impl
}  // namespace aten
}  // namespace dgl