// !!! This is a file automatically generated by hipify!!!
/**
 *  Copyright 2020-2021 Contributors
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 * @file graph/transform/cuda/cuda_to_block.cu
 * @brief Functions to convert a set of edges into a graph block with local
 * ids.
 *
 * Tested via python wrapper: python/dgl/path/to/to_block.py
 */

#include <hip/hip_runtime.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h>

#include <algorithm>
#include <memory>
#include <utility>

#include "../../../runtime/cuda/cuda_common.h"
#include "../../heterograph.h"
#include "../to_block.h"
#include "cuda_map_edges.cuh"

using namespace dgl::aten;
using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda;
using TensorDispatcher = dgl::runtime::TensorDispatcher;

namespace dgl {
namespace transform {

namespace {

template <typename IdType>
class DeviceNodeMapMaker {
 public:
  explicit DeviceNodeMapMaker(const std::vector<int64_t>& maxNodesPerType)
      : max_num_nodes_(0) {
    max_num_nodes_ =
        *std::max_element(maxNodesPerType.begin(), maxNodesPerType.end());
  }

  /**
   * @brief This function builds node maps for each node type, preserving the
   * order of the input nodes. Here it is assumed the lhs_nodes are not unique,
   * and thus a unique list is generated.
   *
   * @param lhs_nodes The set of source input nodes.
   * @param rhs_nodes The set of destination input nodes.
   * @param node_maps The node maps to be constructed.
   * @param count_lhs_device The number of unique source nodes (on the GPU).
   * @param lhs_device The unique source nodes (on the GPU).
   * @param stream The stream to operate on.
   */
  void Make(
      const std::vector<IdArray>& lhs_nodes,
      const std::vector<IdArray>& rhs_nodes,
      DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
      std::vector<IdArray>* const lhs_device, hipStream_t stream) {
    const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();

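    // Zero the per-type unique-node counters before the hash tables
    // accumulate into them.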
    CUDA_CALL(hipMemsetAsync(
        count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));

    // possibly duplicate lhs nodes
    const int64_t lhs_num_ntypes = static_cast<int64_t>(lhs_nodes.size());
    for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) {
      const IdArray& nodes = lhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
        node_maps->LhsHashTable(ntype).FillWithDuplicates(
            nodes.Ptr<IdType>(), nodes->shape[0],
            (*lhs_device)[ntype].Ptr<IdType>(), count_lhs_device + ntype,
            stream);
      }
    }

    // unique rhs nodes
    const int64_t rhs_num_ntypes = static_cast<int64_t>(rhs_nodes.size());
    for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) {
      const IdArray& nodes = rhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        node_maps->RhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }
  }

  /**
   * @brief This function builds node maps for each node type, preserving the
   * order of the input nodes. Here it is assumed both lhs_nodes and rhs_nodes
   * are unique.
   *
   * @param lhs_nodes The set of source input nodes.
   * @param rhs_nodes The set of destination input nodes.
   * @param node_maps The node maps to be constructed.
   * @param stream The stream to operate on.
   */
  void Make(
      const std::vector<IdArray>& lhs_nodes,
      const std::vector<IdArray>& rhs_nodes,
      DeviceNodeMap<IdType>* const node_maps, hipStream_t stream) {
    const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();

    // unique lhs nodes
    const int64_t lhs_num_ntypes = static_cast<int64_t>(lhs_nodes.size());
    for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) {
      const IdArray& nodes = lhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
        node_maps->LhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }

    // unique rhs nodes
    const int64_t rhs_num_ntypes = static_cast<int64_t>(rhs_nodes.size());
    for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) {
      const IdArray& nodes = rhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        node_maps->RhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }
  }

 private:
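  // The maximum number of nodes of any single node type, fixed at
  // construction.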
  IdType max_num_nodes_;
};

template <typename IdType>
struct CUDAIdsMapper {
  std::tuple<std::vector<IdArray>, std::vector<IdArray>> operator()(
      const HeteroGraphPtr& graph, bool include_rhs_in_lhs, int64_t num_ntypes,
      const DGLContext& ctx, const std::vector<int64_t>& maxNodesPerType,
      const std::vector<EdgeArray>& edge_arrays,
      const std::vector<IdArray>& src_nodes,
      const std::vector<IdArray>& rhs_nodes,
      std::vector<IdArray>* const lhs_nodes_ptr,
      std::vector<int64_t>* const num_nodes_per_type_ptr) {
    std::vector<IdArray>& lhs_nodes = *lhs_nodes_ptr;
    std::vector<int64_t>& num_nodes_per_type = *num_nodes_per_type_ptr;
    const bool generate_lhs_nodes = lhs_nodes.empty();
    auto device = runtime::DeviceAPI::Get(ctx);
    hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

    // Allocate space for map creation process.
    DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
    DeviceNodeMap<IdType> node_maps(maxNodesPerType, num_ntypes, ctx, stream);
    if (generate_lhs_nodes) {
      lhs_nodes.reserve(num_ntypes);
      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
        lhs_nodes.emplace_back(
            NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
      }
    }

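    // Event and host tensor used to overlap the device-to-host copy of the
    // unique lhs-node counts with the edge-mapping work below.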
    hipEvent_t copyEvent;
    NDArray new_len_tensor;
    // Populate the mappings.
    if (generate_lhs_nodes) {
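      // GPU scratch counters; the lhs hash tables write the number of
      // unique source nodes per type into this buffer.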
      int64_t* count_lhs_device = static_cast<int64_t*>(
          device->AllocWorkspace(ctx, sizeof(int64_t) * num_ntypes * 2));

      maker.Make(
          src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
          stream);

      CUDA_CALL(hipEventCreate(&copyEvent));
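      // Prefer pinned host memory via the tensor adapter so the
      // device-to-host copy below can run asynchronously.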
      if (TensorDispatcher::Global()->IsAvailable()) {
        new_len_tensor = NDArray::PinnedEmpty(
            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
            DGLContext{kDGLCPU, 0});
      } else {
        // use pageable memory; it will unnecessarily block but be functional
        new_len_tensor = NDArray::Empty(
            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
            DGLContext{kDGLCPU, 0});
      }
      CUDA_CALL(hipMemcpyAsync(
          new_len_tensor->data, count_lhs_device,
          sizeof(*num_nodes_per_type.data()) * num_ntypes,
          hipMemcpyDeviceToHost, stream));
      CUDA_CALL(hipEventRecord(copyEvent, stream));

      device->FreeWorkspace(ctx, count_lhs_device);
    } else {
      maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);

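      // lhs_nodes were supplied by the caller, so their lengths are
      // already final and can be read directly.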
      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
        num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
      }
    }
    // Map node numberings from global to local, and build pointer for CSR.
    auto ret = MapEdges(graph, edge_arrays, node_maps, stream);

    if (generate_lhs_nodes) {
      // wait for the previous copy
      CUDA_CALL(hipEventSynchronize(copyEvent));
      CUDA_CALL(hipEventDestroy(copyEvent));

      // Resize lhs nodes.
      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
        num_nodes_per_type[ntype] =
            static_cast<int64_t*>(new_len_tensor->data)[ntype];
        lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype];
      }
    }

    return ret;
  }
};

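// Shared entry point for the 32- and 64-bit instantiations below: delegates
// the common traversal to ProcessToBlock() and plugs in the GPU ID mapper.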
template <typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlockGPU(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes_ptr) {
  return dgl::transform::ProcessToBlock<IdType>(
      graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes_ptr,
      CUDAIdsMapper<IdType>());
}

}  // namespace

// Use explicit names to get around MSVC's broken mangling that thinks the
// following two functions are the same. Using template<> fails to export the
// symbols.
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
// ToBlock<kDGLCUDA, int32_t>
ToBlockGPU32(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes) {
  return ToBlockGPU<int32_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}

std::tuple<HeteroGraphPtr, std::vector<IdArray>>
// ToBlock<kDGLCUDA, int64_t>
ToBlockGPU64(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes) {
  return ToBlockGPU<int64_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}

}  // namespace transform
}  // namespace dgl