"docs/vscode:/vscode.git/clone" did not exist on "76ca91dff2e108de4b8d9770bbe44289cc16e344"
coo_sort.cu 5.61 KB
Newer Older
1
2
3
4
5
6
7
/*!
 *  Copyright (c) 2020 by Contributors
 * \file array/cuda/coo_sort.cc
 * \brief Sort COO index
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
8
#include "../../c_api_common.h"
9
10
11
12
13
14
15
16
17
18
19
#include "./utils.h"

namespace dgl {

using runtime::NDArray;

namespace aten {
namespace impl {

///////////////////////////// COOSort_ /////////////////////////////

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/**
* @brief Encode row and column IDs into a single scalar per edge.
*
* @tparam IdType The type to encode as.
* @param row The row (src) IDs per edge.
* @param col The column (dst) IDs per edge.
* @param nnz The number of edges.
* @param col_bits The number of bits used to encode the destination. The row
* information is packed into the remaining bits.
* @param key The encoded edges (output).
*/
template <typename IdType>
__global__ void _COOEncodeEdgesKernel(
    const IdType* const row, const IdType* const col,
    const int64_t nnz, const int col_bits, IdType * const key) {

  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  if (tx < nnz) {
    key[tx] = row[tx] << col_bits | col[tx];
  }
41
42
}

43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/**
* @brief Decode row and column IDs from the encoded edges.
*
* @tparam IdType The type the edges are encoded as.
* @param key The encoded edges.
* @param nnz The number of edges.
* @param col_bits The number of bits used to store the column/dst ID.
* @param row The row (src) IDs per edge (output).
* @param col The col (dst) IDs per edge (output).
*/
template <typename IdType>
__global__ void _COODecodeEdgesKernel(
    const IdType* const key, const int64_t nnz, const int col_bits,
    IdType * const row, IdType * const col) {

  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  if (tx < nnz) {
    const IdType k = key[tx];
    row[tx] = k >> col_bits;
    col[tx] = k & ((1 << col_bits) - 1);
64
  }
65
}
66

67
68
69
70
71
72
73


template<typename T>
int _NumberOfBits(const T& range) {
  if (range <= 1) {
    // ranges of 0 or 1 require no bits to store
    return 0;
74
75
  }

76
  int bits = 1;
77
  while (bits < static_cast<int>(sizeof(T)*8) && (1 << bits) < range) {
78
79
80
81
82
83
84
    ++bits;
  }

  CHECK_EQ((range-1) >> bits, 0);
  CHECK_NE((range-1) >> (bits-1), 0);

  return bits;
85
86
}

87
template <DGLDeviceType XPU, typename IdType>
88
void COOSort_(COOMatrix* coo, bool sort_column) {
89
  cudaStream_t stream = runtime::getCurrentCUDAStream();
90
91
92
93
94
95
96
97
98
99
100
101
102
  const int row_bits = _NumberOfBits(coo->num_rows);

  const int64_t nnz = coo->row->shape[0];
  if (sort_column) {
    const int col_bits = _NumberOfBits(coo->num_cols);
    const int num_bits = row_bits + col_bits;

    const int nt = 256;
    const int nb = (nnz+nt-1)/nt;
    CHECK(static_cast<int64_t>(nb)*nt >= nnz);

    IdArray pos = aten::NewIdArray(nnz, coo->row->ctx, coo->row->dtype.bits);

103
    CUDA_KERNEL_CALL(_COOEncodeEdgesKernel, nb, nt, 0, stream,
104
105
106
107
108
        coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>(),
        nnz, col_bits, pos.Ptr<IdType>());

    auto sorted = Sort(pos, num_bits);

109
    CUDA_KERNEL_CALL(_COODecodeEdgesKernel, nb, nt, 0, stream,
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
        sorted.first.Ptr<IdType>(), nnz, col_bits,
        coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>());

    if (aten::COOHasData(*coo))
      coo->data = IndexSelect(coo->data, sorted.second);
    else
      coo->data = AsNumBits(sorted.second, coo->row->dtype.bits);
    coo->row_sorted = coo->col_sorted = true;
  } else {
    const int num_bits = row_bits;

    auto sorted = Sort(coo->row, num_bits);

    coo->row = sorted.first;
    coo->col = IndexSelect(coo->col, sorted.second);

    if (aten::COOHasData(*coo))
      coo->data = IndexSelect(coo->data, sorted.second);
    else
      coo->data = AsNumBits(sorted.second, coo->row->dtype.bits);
    coo->row_sorted = true;
  }
132
133
}

134
135
template void COOSort_<kDGLCUDA, int32_t>(COOMatrix* coo, bool sort_column);
template void COOSort_<kDGLCUDA, int64_t>(COOMatrix* coo, bool sort_column);
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157

///////////////////////////// COOIsSorted /////////////////////////////

template <typename IdType>
__global__ void _COOIsSortedKernel(
    const IdType* row, const IdType* col,
    int64_t nnz, int8_t* row_sorted, int8_t* col_sorted) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < nnz) {
    if (tx == 0) {
      row_sorted[0] = 1;
      col_sorted[0] = 1;
    } else {
      row_sorted[tx] = static_cast<int8_t>(row[tx - 1] <= row[tx]);
      col_sorted[tx] = static_cast<int8_t>(
          row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
    }
    tx += stride_x;
  }
}

158
template <DGLDeviceType XPU, typename IdType>
159
160
161
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
  const int64_t nnz = coo.row->shape[0];
  const auto& ctx = coo.row->ctx;
162
  cudaStream_t stream = runtime::getCurrentCUDAStream();
163
164
165
166
167
168
169
  auto device = runtime::DeviceAPI::Get(ctx);
  // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but should
  // be fine.
  int8_t* row_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
  int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
  const int nt = cuda::FindNumThreads(nnz);
  const int nb = (nnz + nt - 1) / nt;
170
  CUDA_KERNEL_CALL(_COOIsSortedKernel, nb, nt, 0, stream,
171
172
173
174
175
176
177
178
179
180
181
182
      coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
      nnz, row_flags, col_flags);

  const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx);
  const bool col_sorted = row_sorted? cuda::AllTrue(col_flags, nnz, ctx) : false;

  device->FreeWorkspace(ctx, row_flags);
  device->FreeWorkspace(ctx, col_flags);

  return {row_sorted, col_sorted};
}

183
184
template std::pair<bool, bool> COOIsSorted<kDGLCUDA, int32_t>(COOMatrix coo);
template std::pair<bool, bool> COOIsSorted<kDGLCUDA, int64_t>(COOMatrix coo);
185
186
187
188

}  // namespace impl
}  // namespace aten
}  // namespace dgl