coo2csr.hip 4.36 KB
Newer Older
sangwzh's avatar
sangwzh committed
1
2
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
3
/**
4
 *  Copyright (c) 2020 by Contributors
5
6
 * @file array/cuda/coo2csr.cc
 * @brief COO2CSR
7
8
 */
#include <dgl/array.h>
sangwzh's avatar
sangwzh committed
9
10
#include "../../../include/dgl/array.h"

11

12
#include "../../runtime/cuda/cuda_common.h"
sangwzh's avatar
sangwzh committed
13
#include "utils.h"
14
15
16
17
18
19
20
21

namespace dgl {

using runtime::NDArray;

namespace aten {
namespace impl {

22
template <DGLDeviceType XPU, typename IdType>
23
24
25
26
27
28
CSRMatrix COOToCSR(COOMatrix coo) {
  LOG(FATAL) << "Unreachable code.";
  return {};
}

template <>
29
CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
30
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
sangwzh's avatar
sangwzh committed
31
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
32
33
  // allocate cusparse handle if needed
  if (!thr_entry->cusparse_handle) {
sangwzh's avatar
sangwzh committed
34
    CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle)));
35
  }
sangwzh's avatar
sangwzh committed
36
  CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream));
37
38
39
40

  bool row_sorted = coo.row_sorted;
  bool col_sorted = coo.col_sorted;
  if (!row_sorted) {
41
42
    // we only need to sort the rows to perform conversion
    coo = COOSort(coo, false);
43
44
45
46
47
48
49
50
51
52
53
    col_sorted = coo.col_sorted;
  }

  const int64_t nnz = coo.row->shape[0];
  // TODO(minjie): Many of our current implementation assumes that CSR must have
  //   a data array. This is a temporary workaround. Remove this after:
  //   - The old immutable graph implementation is deprecated.
  //   - The old binary reduce kernel is deprecated.
  if (!COOHasData(coo))
    coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx);

54
55
  NDArray indptr =
      aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
56
  int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
sangwzh's avatar
sangwzh committed
57
  CUSPARSE_CALL(hipsparseXcoo2csr(
58
      thr_entry->cusparse_handle, coo.row.Ptr<int32_t>(), nnz, coo.num_rows,
sangwzh's avatar
sangwzh committed
59
      indptr_ptr, HIPSPARSE_INDEX_BASE_ZERO));
60
61
62

  return CSRMatrix(
      coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
63
64
}

65
/**
66
 * @brief Search for the insertion positions for needle in the hay.
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
 *
 * The hay is a list of sorted elements and the result is the insertion position
 * of each needle so that the insertion still gives sorted order.
 *
 * It essentially perform binary search to find upper bound for each needle
 * elements.
 *
 * For example:
 * hay = [0, 0, 1, 2, 2]
 * needle = [0, 1, 2, 3]
 * then,
 * out = [2, 3, 5, 5]
 */
template <typename IdType>
__global__ void _SortedSearchKernelUpperBound(
82
83
    const IdType* hay, int64_t hay_size, const IdType* needles,
    int64_t num_needles, IdType* pos) {
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < num_needles) {
    const IdType ele = needles[tx];
    // binary search
    IdType lo = 0, hi = hay_size;
    while (lo < hi) {
      IdType mid = (lo + hi) >> 1;
      if (hay[mid] <= ele) {
        lo = mid + 1;
      } else {
        hi = mid;
      }
    }
    pos[tx] = lo;
    tx += stride_x;
  }
}

template <>
104
CSRMatrix COOToCSR<kDGLCUDA, int64_t>(COOMatrix coo) {
105
106
  const auto& ctx = coo.row->ctx;
  const auto nbits = coo.row->dtype.bits;
sangwzh's avatar
sangwzh committed
107
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
108
109
110
  bool row_sorted = coo.row_sorted;
  bool col_sorted = coo.col_sorted;
  if (!row_sorted) {
111
    coo = COOSort(coo, false);
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
    col_sorted = coo.col_sorted;
  }

  const int64_t nnz = coo.row->shape[0];
  // TODO(minjie): Many of our current implementation assumes that CSR must have
  //   a data array. This is a temporary workaround. Remove this after:
  //   - The old immutable graph implementation is deprecated.
  //   - The old binary reduce kernel is deprecated.
  if (!COOHasData(coo))
    coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx);

  IdArray rowids = Range(0, coo.num_rows, nbits, ctx);
  const int nt = cuda::FindNumThreads(coo.num_rows);
  const int nb = (coo.num_rows + nt - 1) / nt;
  IdArray indptr = Full(0, coo.num_rows + 1, nbits, ctx);
127
128
129
130
131
132
  CUDA_KERNEL_CALL(
      _SortedSearchKernelUpperBound, nb, nt, 0, stream, coo.row.Ptr<int64_t>(),
      nnz, rowids.Ptr<int64_t>(), coo.num_rows, indptr.Ptr<int64_t>() + 1);

  return CSRMatrix(
      coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
133
134
}

135
136
template CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo);
template CSRMatrix COOToCSR<kDGLCUDA, int64_t>(COOMatrix coo);
137
138
139
140

}  // namespace impl
}  // namespace aten
}  // namespace dgl