array_sort.cu 1.67 KB
Newer Older
1
2
3
4
5
6
7
8
/*!
 *  Copyright (c) 2020 by Contributors
 * \file array/cpu/array_sort.cu
 * \brief Array sort GPU implementation
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
9
#include "./dgl_cub.cuh"
10
11
12
13
14
15
16

namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {

template <DLDeviceType XPU, typename IdType>
17
std::pair<IdArray, IdArray> Sort(IdArray array, int num_bits) {
18
19
20
21
22
23
24
25
26
27
28
29
  const auto& ctx = array->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  const int64_t nitems = array->shape[0];
  IdArray orig_idx = Range(0, nitems, 64, ctx);
  IdArray sorted_array = NewIdArray(nitems, ctx, array->dtype.bits);
  IdArray sorted_idx = NewIdArray(nitems, ctx, 64);

  const IdType* keys_in = array.Ptr<IdType>();
  const int64_t* values_in = orig_idx.Ptr<int64_t>();
  IdType* keys_out = sorted_array.Ptr<IdType>();
  int64_t* values_out = sorted_idx.Ptr<int64_t>();

30
31
32
33
  if (num_bits == 0) {
    num_bits = sizeof(IdType)*8;
  }

34
35
  // Allocate workspace
  size_t workspace_size = 0;
36
37
  CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr, workspace_size,
      keys_in, keys_out, values_in, values_out, nitems, 0, num_bits));
38
39
40
  void* workspace = device->AllocWorkspace(ctx, workspace_size);

  // Compute
41
42
  CUDA_CALL(cub::DeviceRadixSort::SortPairs(workspace, workspace_size,
      keys_in, keys_out, values_in, values_out, nitems, 0, num_bits));
43
44
45
46
47
48

  device->FreeWorkspace(ctx, workspace);

  return std::make_pair(sorted_array, sorted_idx);
}

49
50
template std::pair<IdArray, IdArray> Sort<kDLGPU, int32_t>(IdArray, int num_bits);
template std::pair<IdArray, IdArray> Sort<kDLGPU, int64_t>(IdArray, int num_bits);
51
52
53
54

}  // namespace impl
}  // namespace aten
}  // namespace dgl