/*!
 *  Copyright 2020-2021 Contributors
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 * \file graph/transform/cuda/cuda_to_block.cu
 * \brief Functions to convert a set of edges into a graph block with local
 * ids.
 */

#include <cuda_runtime.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h>

#include <algorithm>
#include <memory>
#include <utility>

#include "../../../runtime/cuda/cuda_common.h"
#include "../../heterograph.h"
#include "../to_bipartite.h"
#include "cuda_map_edges.cuh"

using namespace dgl::aten;
using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda;

namespace dgl {
namespace transform {

namespace {

template <typename IdType>
class DeviceNodeMapMaker {
 public:
  explicit DeviceNodeMapMaker(const std::vector<int64_t>& maxNodesPerType)
      : max_num_nodes_(0) {
    max_num_nodes_ =
        *std::max_element(maxNodesPerType.begin(), maxNodesPerType.end());
  }

  /**
   * \brief This function builds node maps for each node type, preserving the
   * order of the input nodes. Here it is assumed the lhs_nodes are not unique,
   * and thus a unique list is generated.
   *
   * \param lhs_nodes The set of source input nodes.
   * \param rhs_nodes The set of destination input nodes.
   * \param node_maps The node maps to be constructed.
   * \param count_lhs_device The number of unique source nodes (on the GPU).
   * \param lhs_device The unique source nodes (on the GPU).
   * \param stream The stream to operate on.
   */
  void Make(
      const std::vector<IdArray>& lhs_nodes,
      const std::vector<IdArray>& rhs_nodes,
      DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
      std::vector<IdArray>* const lhs_device, cudaStream_t stream) {
    const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();

    CUDA_CALL(cudaMemsetAsync(
        count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));

    // possibly duplicate lhs nodes
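    // FillWithDuplicates deduplicates each input array, writing the unique
    // nodes into (*lhs_device)[ntype] and their count into
    // count_lhs_device + ntype.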
    const int64_t lhs_num_ntypes = static_cast<int64_t>(lhs_nodes.size());
    for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) {
      const IdArray& nodes = lhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
        node_maps->LhsHashTable(ntype).FillWithDuplicates(
            nodes.Ptr<IdType>(), nodes->shape[0],
            (*lhs_device)[ntype].Ptr<IdType>(), count_lhs_device + ntype,
            stream);
      }
    }

    // unique rhs nodes
    const int64_t rhs_num_ntypes = static_cast<int64_t>(rhs_nodes.size());
    for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) {
      const IdArray& nodes = rhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        node_maps->RhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }
  }

  /**
   * \brief This function builds node maps for each node type, preserving the
   * order of the input nodes. Here it is assumed both lhs_nodes and rhs_nodes
   * are unique.
   *
   * \param lhs_nodes The set of source input nodes.
   * \param rhs_nodes The set of destination input nodes.
   * \param node_maps The node maps to be constructed.
   * \param stream The stream to operate on.
   */
  void Make(
      const std::vector<IdArray>& lhs_nodes,
      const std::vector<IdArray>& rhs_nodes,
      DeviceNodeMap<IdType>* const node_maps, cudaStream_t stream) {
    const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();

    // unique lhs nodes
    const int64_t lhs_num_ntypes = static_cast<int64_t>(lhs_nodes.size());
    for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) {
      const IdArray& nodes = lhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
        node_maps->LhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }

    // unique rhs nodes
    const int64_t rhs_num_ntypes = static_cast<int64_t>(rhs_nodes.size());
    for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) {
      const IdArray& nodes = rhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        node_maps->RhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }
  }

 private:
  IdType max_num_nodes_;
};

// Since partial specialization is not allowed for functions, use this as an
// intermediate for ToBlock where XPU = kDGLCUDA.
template <typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlockGPU(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    const bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes_ptr) {
  std::vector<IdArray>& lhs_nodes = *lhs_nodes_ptr;
  const bool generate_lhs_nodes = lhs_nodes.empty();

  const auto& ctx = graph->Context();
  auto device = runtime::DeviceAPI::Get(ctx);
  cudaStream_t stream = runtime::getCurrentCUDAStream();

  CHECK_EQ(ctx.device_type, kDGLCUDA);
  for (const auto& nodes : rhs_nodes) {
    CHECK_EQ(ctx.device_type, nodes->ctx.device_type);
  }

  // Since DST nodes are included in SRC nodes, a common requirement is to fetch
  // the DST node features from the SRC nodes features. To avoid expensive
  // sparse lookup, the function ensures that the DST nodes in both SRC and DST
  // sets have the same ids. As a result, given the node feature tensor ``X`` of
  // type ``utype``, the following code finds the corresponding DST node
  // features of type ``vtype``:
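  //
  //   X[:block.number_of_nodes('DST/vtype')]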

  const int64_t num_etypes = graph->NumEdgeTypes();
  const int64_t num_ntypes = graph->NumVertexTypes();

  CHECK(rhs_nodes.size() == static_cast<size_t>(num_ntypes))
      << "rhs_nodes not given for every node type";

  std::vector<EdgeArray> edge_arrays(num_etypes);
  for (int64_t etype = 0; etype < num_etypes; ++etype) {
    const auto src_dst_types = graph->GetEndpointTypes(etype);
    const dgl_type_t dsttype = src_dst_types.second;
    if (!aten::IsNullArray(rhs_nodes[dsttype])) {
      edge_arrays[etype] = graph->Edges(etype);
    }
  }

  // count lhs and rhs nodes
  std::vector<int64_t> maxNodesPerType(num_ntypes * 2, 0);
  for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
    maxNodesPerType[ntype + num_ntypes] += rhs_nodes[ntype]->shape[0];

    if (generate_lhs_nodes) {
      if (include_rhs_in_lhs) {
        maxNodesPerType[ntype] += rhs_nodes[ntype]->shape[0];
      }
    } else {
      maxNodesPerType[ntype] += lhs_nodes[ntype]->shape[0];
    }
  }
  if (generate_lhs_nodes) {
    // we don't have lhs_nodes, so we need to count inbound edges to get an
    // upper bound
    for (int64_t etype = 0; etype < num_etypes; ++etype) {
      const auto src_dst_types = graph->GetEndpointTypes(etype);
      const dgl_type_t srctype = src_dst_types.first;
      if (edge_arrays[etype].src.defined()) {
        maxNodesPerType[srctype] += edge_arrays[etype].src->shape[0];
      }
    }
  }

  // gather lhs_nodes
  std::vector<IdArray> src_nodes(num_ntypes);
  if (generate_lhs_nodes) {
    std::vector<int64_t> src_node_offsets(num_ntypes, 0);
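    // src_node_offsets tracks a running byte offset into each src_nodes
    // array; CopyDataFromTo takes byte offsets, so each copy advances the
    // offset by sizeof(IdType) times the number of copied ids.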
    for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
      src_nodes[ntype] =
          NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8);
      if (include_rhs_in_lhs) {
        // place rhs nodes first
        device->CopyDataFromTo(
            rhs_nodes[ntype].Ptr<IdType>(), 0, src_nodes[ntype].Ptr<IdType>(),
            src_node_offsets[ntype],
            sizeof(IdType) * rhs_nodes[ntype]->shape[0], rhs_nodes[ntype]->ctx,
            src_nodes[ntype]->ctx, rhs_nodes[ntype]->dtype);
        src_node_offsets[ntype] += sizeof(IdType) * rhs_nodes[ntype]->shape[0];
      }
    }
    for (int64_t etype = 0; etype < num_etypes; ++etype) {
      const auto src_dst_types = graph->GetEndpointTypes(etype);
      const dgl_type_t srctype = src_dst_types.first;
      if (edge_arrays[etype].src.defined()) {
        device->CopyDataFromTo(
            edge_arrays[etype].src.Ptr<IdType>(), 0,
            src_nodes[srctype].Ptr<IdType>(), src_node_offsets[srctype],
            sizeof(IdType) * edge_arrays[etype].src->shape[0],
            rhs_nodes[srctype]->ctx, src_nodes[srctype]->ctx,
            rhs_nodes[srctype]->dtype);

        src_node_offsets[srctype] +=
            sizeof(IdType) * edge_arrays[etype].src->shape[0];
      }
    }
  } else {
    for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
      src_nodes[ntype] = lhs_nodes[ntype];
    }
  }

  // allocate space for map creation process
  DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
  DeviceNodeMap<IdType> node_maps(maxNodesPerType, num_ntypes, ctx, stream);

  if (generate_lhs_nodes) {
    lhs_nodes.reserve(num_ntypes);
    for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
      lhs_nodes.emplace_back(
          NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
    }
  }

  std::vector<int64_t> num_nodes_per_type(num_ntypes * 2);
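  // entries [0, num_ntypes) will hold the SRC (lhs) node counts, entries
  // [num_ntypes, 2 * num_ntypes) the DST (rhs) node counts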
  // populate RHS nodes from what we already know
  for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
    num_nodes_per_type[num_ntypes + ntype] = rhs_nodes[ntype]->shape[0];
  }

  // populate the mappings
  if (generate_lhs_nodes) {
    int64_t* count_lhs_device = static_cast<int64_t*>(
        device->AllocWorkspace(ctx, sizeof(int64_t) * num_ntypes * 2));

    maker.Make(
        src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes, stream);

    device->CopyDataFromTo(
        count_lhs_device, 0, num_nodes_per_type.data(), 0,
        sizeof(*num_nodes_per_type.data()) * num_ntypes, ctx,
        DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1});
    device->StreamSync(ctx, stream);

    // the node counts have finished transferring, so the workspace can be
    // freed
    device->FreeWorkspace(ctx, count_lhs_device);
  } else {
    maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);

    for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
      num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
    }
  }

  std::vector<IdArray> induced_edges;
  induced_edges.reserve(num_etypes);
  for (int64_t etype = 0; etype < num_etypes; ++etype) {
    if (edge_arrays[etype].id.defined()) {
      induced_edges.push_back(edge_arrays[etype].id);
    } else {
      induced_edges.push_back(
          aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx));
    }
  }

  // build metagraph -- small enough to be done on CPU
  const auto meta_graph = graph->meta_graph();
  const EdgeArray etypes = meta_graph->Edges("eid");
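  // shift the dst node types by num_ntypes: the block's metagraph places the
  // SRC node types in [0, num_ntypes) and the DST node types in
  // [num_ntypes, 2 * num_ntypes)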
  const IdArray new_dst = Add(etypes.dst, num_ntypes);
  const auto new_meta_graph =
      ImmutableGraph::CreateFromCOO(num_ntypes * 2, etypes.src, new_dst);

  // allocate vector for graph relations while GPU is busy
  std::vector<HeteroGraphPtr> rel_graphs;
  rel_graphs.reserve(num_etypes);

  // map node numberings from global to local, and build pointer for CSR
  std::vector<IdArray> new_lhs;
  std::vector<IdArray> new_rhs;
  std::tie(new_lhs, new_rhs) = MapEdges(graph, edge_arrays, node_maps, stream);

  // resize lhs nodes
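  // (they were allocated at an upper bound, so shrink shape[0] in place to
  // the number of unique nodes actually produced for each type)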
  if (generate_lhs_nodes) {
    for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
      lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype];
    }
  }

  // build the heterograph
  for (int64_t etype = 0; etype < num_etypes; ++etype) {
    const auto src_dst_types = graph->GetEndpointTypes(etype);
    const dgl_type_t srctype = src_dst_types.first;
    const dgl_type_t dsttype = src_dst_types.second;

    if (rhs_nodes[dsttype]->shape[0] == 0) {
      // No rhs nodes are given for this edge type. Create an empty graph.
      rel_graphs.push_back(CreateFromCOO(
          2, lhs_nodes[srctype]->shape[0], rhs_nodes[dsttype]->shape[0],
          aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx),
          aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx)));
    } else {
      rel_graphs.push_back(CreateFromCOO(
          2, lhs_nodes[srctype]->shape[0], rhs_nodes[dsttype]->shape[0],
          new_lhs[etype], new_rhs[etype]));
    }
  }

  HeteroGraphPtr new_graph =
      CreateHeteroGraph(new_meta_graph, rel_graphs, num_nodes_per_type);

  // return the new graph and the induced edges (the new src nodes are
  // returned via lhs_nodes_ptr)
  return std::make_tuple(new_graph, induced_edges);
}

}  // namespace

// Use explicit names to get around MSVC's broken mangling that thinks the
// following two functions are the same. Using template<> fails to export the
// symbols.
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
// ToBlock<kDGLCUDA, int32_t>
ToBlockGPU32(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes) {
  return ToBlockGPU<int32_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}

std::tuple<HeteroGraphPtr, std::vector<IdArray>>
// ToBlock<kDGLCUDA, int64_t>
ToBlockGPU64(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes) {
  return ToBlockGPU<int64_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}

}  // namespace transform
}  // namespace dgl