// !!! This is a file automatically generated by hipify!!!
/**
 *  Copyright (c) 2023 by Contributors
 *  Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
 * @file cuda/unique_and_compact_impl.cu
 * @brief Unique and compact operator implementation on CUDA.
 */
#include <hip/hip_runtime.h>
#include <graphbolt/cuda_ops.h>
#include <thrust/binary_search.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/logical.h>

#include <hipcub/hipcub.hpp>
#include <type_traits>

#include "common.h"
#include "utils.h"

namespace graphbolt {
namespace ops {

/**
 * @brief Predicate telling whether a previously located position holds an
 * exact match for the searched item.
 *
 * For an index i, the element of sorted_order at position found_locations[i]
 * is compared for equality with searched_items[i].
 *
 * @tparam scalar_t Element type shared by all three arrays. The locations are
 * stored with the same type as the items and are used directly as indices.
 */
template <typename scalar_t>
struct EqualityFunc {
  // Array that is indexed by the located positions.
  const scalar_t* sorted_order;
  // found_locations[i] is the position in sorted_order to inspect for item i.
  const scalar_t* found_locations;
  // searched_items[i] is the value expected at that position.
  const scalar_t* searched_items;

  // The call only reads through the stored pointers, so it is const-qualified
  // and can be invoked on const functor objects as well.
  // NOTE(review): found_locations[i] is not range-checked here. If it can be
  // equal to the length of sorted_order (e.g. a lower_bound result for an item
  // larger than every entry), this reads one element past the end -- confirm
  // that callers rule this out.
  __host__ __device__ bool operator()(int64_t i) const {
    return sorted_order[found_locations[i]] == searched_items[i];
  }
};

#define DefineCubReductionFunction(cub_reduce_fn, name)           \
  template <typename scalar_iterator_t>                           \
  auto name(const scalar_iterator_t input, int64_t size) {        \
    using scalar_t = std::remove_reference_t<decltype(input[0])>; \
    cuda::CopyScalar<scalar_t> result;                            \
    CUB_CALL(cub_reduce_fn, input, result.get(), size);           \
    return result;                                                \
41
42
  }

43
44
DefineCubReductionFunction(DeviceReduce::Max, Max);
DefineCubReductionFunction(DeviceReduce::Min, Min);
/**
 * @brief Builds a compact id space from src_ids and unique_dst_ids and
 * relabels src_ids and dst_ids into it.
 *
 * The ids of src_ids that do not occur in unique_dst_ids are extracted and
 * deduplicated, then appended after unique_dst_ids to form the first returned
 * tensor. src_ids and dst_ids are then looked up in a sorted copy of that
 * tensor and replaced by the values gathered from the second output of Sort.
 *
 * @param src_ids Source ids; must have the same dtype as the other tensors.
 * @param dst_ids Destination ids; every value is expected to be present in
 * unique_dst_ids.
 * @param unique_dst_ids Ids placed at the front of the compact id space.
 * NOTE(review): presumably already deduplicated by the caller -- this function
 * does not check it.
 * @param num_bits Passed to the sort calls. When 0, it is derived after the
 * first sort from 1 + max(max(src_ids), max(unique_dst_ids)).
 *
 * @return Tuple of (unique_dst_ids concatenated with the unique src-only ids,
 * relabeled src_ids, relabeled dst_ids). The relabeled tensors are created
 * with empty_like of their inputs.
 *
 * @throws std::out_of_range if dst_ids is non-empty and the existence check
 * over dst_ids fails. TORCH_CHECK fails if the dtypes differ.
 */
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
    const torch::Tensor src_ids, const torch::Tensor dst_ids,
    const torch::Tensor unique_dst_ids, int num_bits) {
  TORCH_CHECK(
      src_ids.scalar_type() == dst_ids.scalar_type() &&
          dst_ids.scalar_type() == unique_dst_ids.scalar_type(),
      "Dtypes of tensors passed to UniqueAndCompact need to be identical.");
  auto allocator = cuda::GetAllocator();
  auto stream = cuda::GetCurrentStream();
  return AT_DISPATCH_INTEGRAL_TYPES(
      src_ids.scalar_type(), "unique_and_compact", ([&] {
        auto src_ids_ptr = src_ids.data_ptr<scalar_t>();
        auto dst_ids_ptr = dst_ids.data_ptr<scalar_t>();
        auto unique_dst_ids_ptr = unique_dst_ids.data_ptr<scalar_t>();

        // If num_bits is not given, compute maximum vertex ids to compute
        // num_bits later to speedup the expensive sort operations.
        cuda::CopyScalar<scalar_t> max_id_src;
        cuda::CopyScalar<scalar_t> max_id_dst;
        if (num_bits == 0) {
          max_id_src = Max(src_ids_ptr, src_ids.size(0));
          max_id_dst = Max(unique_dst_ids_ptr, unique_dst_ids.size(0));
        }

        // Sort the unique_dst_ids tensor.
        // NOTE(review): num_bits may still be 0 at this point; presumably Sort
        // treats 0 as "use all bits" -- verify against Sort.
        auto sorted_unique_dst_ids =
            Sort<false>(unique_dst_ids_ptr, unique_dst_ids.size(0), num_bits);
        auto sorted_unique_dst_ids_ptr =
            sorted_unique_dst_ids.data_ptr<scalar_t>();

        // Mark dst nodes in the src_ids tensor.
        auto is_dst = allocator.AllocateStorage<bool>(src_ids.size(0));
        THRUST_CALL(
            binary_search, sorted_unique_dst_ids_ptr,
            sorted_unique_dst_ids_ptr + unique_dst_ids.size(0), src_ids_ptr,
            src_ids_ptr + src_ids.size(0), is_dst.get());

        // Filter the non-dst nodes in the src_ids tensor, hence only_src.
        auto only_src =
            torch::empty(src_ids.size(0), sorted_unique_dst_ids.options());
        {
          auto is_src = thrust::make_transform_iterator(
              is_dst.get(), thrust::logical_not<bool>{});
          cuda::CopyScalar<int64_t> only_src_size;
          CUB_CALL(
              DeviceSelect::Flagged, src_ids_ptr, is_src,
              only_src.data_ptr<scalar_t>(), only_src_size.get(),
              src_ids.size(0));
          // The selected count is needed on the host to shrink the tensor.
          stream.synchronize();
          only_src = only_src.slice(0, 0, static_cast<int64_t>(only_src_size));
        }

        // The code block above synchronizes, ensuring safe access to max_id_src
        // and max_id_dst.
        if (num_bits == 0) {
          num_bits = cuda::NumberOfBits(
              1 + ::max(
                      static_cast<scalar_t>(max_id_src),
                      static_cast<scalar_t>(max_id_dst)));
        }

        // Sort the only_src tensor so that we can unique it later.
        auto sorted_only_src = Sort<false>(
            only_src.data_ptr<scalar_t>(), only_src.size(0), num_bits);

        auto unique_only_src =
            torch::empty(only_src.size(0), src_ids.options());
        auto unique_only_src_ptr = unique_only_src.data_ptr<scalar_t>();

        {  // Compute the unique operation on the only_src tensor.
          cuda::CopyScalar<int64_t> unique_only_src_size;
          CUB_CALL(
              DeviceSelect::Unique, sorted_only_src.data_ptr<scalar_t>(),
              unique_only_src_ptr, unique_only_src_size.get(),
              only_src.size(0));
          // The unique count is needed on the host to shrink the tensor.
          stream.synchronize();
          unique_only_src = unique_only_src.slice(
              0, 0, static_cast<int64_t>(unique_only_src_size));
        }

        auto real_order = torch::cat({unique_dst_ids, unique_only_src});
        // Sort here so that binary search can be used to lookup new_ids.
        torch::Tensor sorted_order, new_ids;
        std::tie(sorted_order, new_ids) = Sort(real_order, num_bits);
        auto sorted_order_ptr = sorted_order.data_ptr<scalar_t>();
        auto new_ids_ptr = new_ids.data_ptr<int64_t>();
        // Holds the found locations of the src and dst ids in the sorted_order.
        // Later is used to lookup the new ids of the src_ids and dst_ids
        // tensors.
        auto new_dst_ids_loc =
            allocator.AllocateStorage<scalar_t>(dst_ids.size(0));
        THRUST_CALL(
            lower_bound, sorted_order_ptr,
            sorted_order_ptr + sorted_order.size(0), dst_ids_ptr,
            dst_ids_ptr + dst_ids.size(0), new_dst_ids_loc.get());

        cuda::CopyScalar<bool> all_exist;
        // Check if unique_dst_ids includes all dst_ids.
        // NOTE(review): a dst id larger than every entry of sorted_order gets
        // a lower_bound location equal to sorted_order.size(0), so the check
        // below would read one element past the end of sorted_order before
        // reporting the mismatch -- confirm whether this needs a bound guard.
        if (dst_ids.size(0) > 0) {
          thrust::counting_iterator<int64_t> iota(0);
          auto equal_it = thrust::make_transform_iterator(
              iota, EqualityFunc<scalar_t>{
                        sorted_order_ptr, new_dst_ids_loc.get(), dst_ids_ptr});
          // Minimum over booleans is true only if every dst id was matched.
          all_exist = Min(equal_it, dst_ids.size(0));
          all_exist.record();
        }

        auto new_src_ids_loc =
            allocator.AllocateStorage<scalar_t>(src_ids.size(0));
        THRUST_CALL(
            lower_bound, sorted_order_ptr,
            sorted_order_ptr + sorted_order.size(0), src_ids_ptr,
            src_ids_ptr + src_ids.size(0), new_src_ids_loc.get());

        // Finally, lookup the new compact ids of the src and dst tensors via
        // gather operations.
        auto new_src_ids = torch::empty_like(src_ids);
        THRUST_CALL(
            gather, new_src_ids_loc.get(),
            new_src_ids_loc.get() + src_ids.size(0), new_ids_ptr,
            new_src_ids.data_ptr<scalar_t>());
        // Perform check before we gather for the dst indices.
        if (dst_ids.size(0) > 0 && !static_cast<bool>(all_exist)) {
          throw std::out_of_range("Some ids not found.");
        }
        auto new_dst_ids = torch::empty_like(dst_ids);
        THRUST_CALL(
            gather, new_dst_ids_loc.get(),
            new_dst_ids_loc.get() + dst_ids.size(0), new_ids_ptr,
            new_dst_ids.data_ptr<scalar_t>());
        return std::make_tuple(real_order, new_src_ids, new_dst_ids);
      }));
}

}  // namespace ops
}  // namespace graphbolt