array_nonzero.hip 2.14 KB
Newer Older
sangwzh's avatar
sangwzh committed
1
2
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
3
/**
 *  Copyright (c) 2020 by Contributors
 * @file array/cuda/array_nonzero.hip
 * @brief Array nonzero GPU (HIP) implementation
 */
8

9
#include <dgl/array.h>
sangwzh's avatar
sangwzh committed
10
#include "../../../include/dgl/array.h"
11

sangwzh's avatar
sangwzh committed
12
13

#include <hipcub/hipcub.hpp>
14

15
#include "../../runtime/cuda/cuda_common.h"
sangwzh's avatar
sangwzh committed
16
#include "utils.h"
17
18
19
20
21
22
23

namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {

template <typename IdType>
24
struct IsNonZeroIndex {
25
  explicit IsNonZeroIndex(const IdType* array) : array_(array) {}
26

27
  __device__ bool operator()(const int64_t index) { return array_[index] != 0; }
28

29
  const IdType* array_;
30
31
};

32
template <DGLDeviceType XPU, typename IdType>
33
IdArray NonZero(IdArray array) {
34
35
36
  const auto& ctx = array->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);

37
  const int64_t len = array->shape[0];
38
39
  IdArray ret = NewIdArray(len, ctx, 64);

sangwzh's avatar
sangwzh committed
40
  hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();
41

42
43
  const IdType* const in_data = static_cast<const IdType*>(array->data);
  int64_t* const out_data = static_cast<int64_t*>(ret->data);
44
45

  IsNonZeroIndex<IdType> comp(in_data);
sangwzh's avatar
sangwzh committed
46
  hipcub::CountingInputIterator<int64_t> counter(0);
47
48

  // room for cub to output on GPU
49
50
  int64_t* d_num_nonzeros =
      static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
51
52

  size_t temp_size = 0;
sangwzh's avatar
sangwzh committed
53
  CUDA_CALL(hipcub::DeviceSelect::If(
54
55
56
      nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp,
      stream));
  void* temp = device->AllocWorkspace(ctx, temp_size);
sangwzh's avatar
sangwzh committed
57
  CUDA_CALL(hipcub::DeviceSelect::If(
58
      temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream));
59
60
61
  device->FreeWorkspace(ctx, temp);

  // copy number of selected elements from GPU to CPU
62
  int64_t num_nonzeros = cuda::GetCUDAScalar(device, ctx, d_num_nonzeros);
63
64
65
66
  device->FreeWorkspace(ctx, d_num_nonzeros);
  device->StreamSync(ctx, stream);

  // truncate array to size
67
68
69
  return ret.CreateView({num_nonzeros}, ret->dtype, 0);
}

70
71
// Explicit instantiations of NonZero for the kDGLCUDA device type with 32-bit
// and 64-bit input element types.
template IdArray NonZero<kDGLCUDA, int32_t>(IdArray);
template IdArray NonZero<kDGLCUDA, int64_t>(IdArray);
72
73
74
75

}  // namespace impl
}  // namespace aten
}  // namespace dgl