/**
 *  Copyright 2020-2021 Contributors
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 * @file graph/transform/cuda/cuda_to_block.cu
 * @brief Functions to convert a set of edges into a graph block with local
 * ids.
 *
 * Tested via python wrapper: python/dgl/path/to/to_block.py
 */

#include <cuda_runtime.h>

#include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h>

#include <algorithm>
#include <cstdint>
#include <memory>
#include <tuple>
#include <utility>
#include <vector>

#include "../../../runtime/cuda/cuda_common.h"
#include "../../heterograph.h"
#include "../to_block.h"
#include "cuda_map_edges.cuh"

using namespace dgl::aten;
using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda;
namespace dgl {
namespace transform {

namespace {

/**
 * @brief Builds the per-node-type hash tables that map global node ids to
 * block-local ids, for both the source (lhs) and destination (rhs) sides.
 */
template <typename IdType>
class DeviceNodeMapMaker {
 public:
  /**
   * @brief Create a maker sized for the largest node type.
   *
   * @param maxNodesPerType The maximum number of nodes of each type that the
   * maps may need to hold.
   */
  explicit DeviceNodeMapMaker(const std::vector<int64_t>& maxNodesPerType)
      : max_num_nodes_(0) {
    max_num_nodes_ =
        *std::max_element(maxNodesPerType.begin(), maxNodesPerType.end());
  }

  /**
   * @brief This function builds node maps for each node type, preserving the
   * order of the input nodes. Here it is assumed the lhs_nodes are not unique,
   * and thus a unique list is generated.
   *
   * @param lhs_nodes The set of source input nodes.
   * @param rhs_nodes The set of destination input nodes.
   * @param node_maps The node maps to be constructed.
   * @param count_lhs_device The number of unique source nodes (on the GPU).
   * @param lhs_device The unique source nodes (on the GPU).
   * @param stream The stream to operate on.
   */
  void Make(
      const std::vector<IdArray>& lhs_nodes,
      const std::vector<IdArray>& rhs_nodes,
      DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
      std::vector<IdArray>* const lhs_device, cudaStream_t stream) {
    const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();

    // Zero the per-type unique-node counters before the hash tables
    // accumulate into them asynchronously.
    CUDA_CALL(cudaMemsetAsync(
        count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));

    // possibly duplicate lhs nodes
    const int64_t lhs_num_ntypes = static_cast<int64_t>(lhs_nodes.size());
    for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) {
      const IdArray& nodes = lhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
        // De-duplicates while writing the unique ids into lhs_device and the
        // per-type unique count into count_lhs_device + ntype.
        node_maps->LhsHashTable(ntype).FillWithDuplicates(
            nodes.Ptr<IdType>(), nodes->shape[0],
            (*lhs_device)[ntype].Ptr<IdType>(), count_lhs_device + ntype,
            stream);
      }
    }

    // unique rhs nodes
    const int64_t rhs_num_ntypes = static_cast<int64_t>(rhs_nodes.size());
    for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) {
      const IdArray& nodes = rhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        node_maps->RhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }
  }

  /**
   * @brief This function builds node maps for each node type, preserving the
   * order of the input nodes. Here it is assumed both lhs_nodes and rhs_nodes
   * are unique.
   *
   * @param lhs_nodes The set of source input nodes.
   * @param rhs_nodes The set of destination input nodes.
   * @param node_maps The node maps to be constructed.
   * @param stream The stream to operate on.
   */
  void Make(
      const std::vector<IdArray>& lhs_nodes,
      const std::vector<IdArray>& rhs_nodes,
      DeviceNodeMap<IdType>* const node_maps, cudaStream_t stream) {
    // unique lhs nodes
    const int64_t lhs_num_ntypes = static_cast<int64_t>(lhs_nodes.size());
    for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) {
      const IdArray& nodes = lhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
        node_maps->LhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }

    // unique rhs nodes
    const int64_t rhs_num_ntypes = static_cast<int64_t>(rhs_nodes.size());
    for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) {
      const IdArray& nodes = rhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        node_maps->RhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }
  }

 private:
  // Capacity bound: maximum node count over all types; set in the
  // constructor and retained for sizing decisions.
  IdType max_num_nodes_;
};

/**
 * @brief Functor performing the GPU mapping of global node ids to
 * block-local ids, used by ProcessToBlock().
 */
template <typename IdType>
struct CUDAIdsMapper {
  /**
   * @brief Build the lhs/rhs node maps on the GPU and remap the edges.
   *
   * @param graph The input heterograph.
   * @param include_rhs_in_lhs Whether destination nodes are also counted as
   * source nodes (unused here; the caller has already folded them into
   * src_nodes when set).
   * @param num_ntypes The number of node types.
   * @param ctx The device context to operate in.
   * @param maxNodesPerType The maximum number of nodes per type.
   * @param edge_arrays The edges (in global ids) to remap.
   * @param src_nodes The source nodes (possibly with duplicates).
   * @param rhs_nodes The unique destination nodes.
   * @param lhs_nodes_ptr In/out: unique source nodes; generated here when
   * passed in empty.
   * @param num_nodes_per_type_ptr Out: the number of unique nodes per type.
   *
   * @return The remapped (local-id) source and destination edge arrays.
   */
  std::tuple<std::vector<IdArray>, std::vector<IdArray>> operator()(
      const HeteroGraphPtr& graph, bool include_rhs_in_lhs, int64_t num_ntypes,
      const DGLContext& ctx, const std::vector<int64_t>& maxNodesPerType,
      const std::vector<EdgeArray>& edge_arrays,
      const std::vector<IdArray>& src_nodes,
      const std::vector<IdArray>& rhs_nodes,
      std::vector<IdArray>* const lhs_nodes_ptr,
      std::vector<int64_t>* const num_nodes_per_type_ptr) {
    std::vector<IdArray>& lhs_nodes = *lhs_nodes_ptr;
    std::vector<int64_t>& num_nodes_per_type = *num_nodes_per_type_ptr;
    // An empty lhs_nodes means we must compute the unique source nodes
    // (and their counts) ourselves.
    const bool generate_lhs_nodes = lhs_nodes.empty();
    auto device = runtime::DeviceAPI::Get(ctx);
    cudaStream_t stream = runtime::getCurrentCUDAStream();

    // Allocate space for map creation process.
    DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
    DeviceNodeMap<IdType> node_maps(maxNodesPerType, num_ntypes, ctx, stream);
    if (generate_lhs_nodes) {
      // Allocate output buffers at maximum capacity; they are shrunk to the
      // actual unique counts below.
      lhs_nodes.reserve(num_ntypes);
      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
        lhs_nodes.emplace_back(
            NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
      }
    }

    // Populate the mappings.
    if (generate_lhs_nodes) {
      int64_t* count_lhs_device = static_cast<int64_t*>(
          device->AllocWorkspace(ctx, sizeof(int64_t) * num_ntypes * 2));

      maker.Make(
          src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
          stream);

      device->CopyDataFromTo(
          count_lhs_device, 0, num_nodes_per_type.data(), 0,
          sizeof(*num_nodes_per_type.data()) * num_ntypes, ctx,
          DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1});
      // Wait for the node counts to finish transferring.
      device->StreamSync(ctx, stream);
      device->FreeWorkspace(ctx, count_lhs_device);
    } else {
      maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);

      // The caller supplied unique lhs nodes, so the counts are simply
      // their lengths.
      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
        num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
      }
    }

    // Resize lhs nodes.
    if (generate_lhs_nodes) {
      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
        lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype];
      }
    }

    // Map node numberings from global to local, and build pointer for CSR.
    return MapEdges(graph, edge_arrays, node_maps, stream);
  }
};
/**
 * @brief GPU implementation of ToBlock: delegates the shared conversion
 * logic to ProcessToBlock() with the CUDA id-mapping functor.
 *
 * @tparam IdType The node/edge id type (int32_t or int64_t).
 * @param graph The input heterograph.
 * @param rhs_nodes The unique destination nodes per type.
 * @param include_rhs_in_lhs Whether destination nodes also appear as
 * source nodes of the block.
 * @param lhs_nodes_ptr In/out source nodes; generated when passed in empty.
 *
 * @return The block graph and its source nodes.
 */
template <typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlockGPU(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes_ptr) {
  return dgl::transform::ProcessToBlock<IdType>(
      graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes_ptr,
      CUDAIdsMapper<IdType>());
}

}  // namespace

// Use explicit names to get around MSVC's broken mangling that thinks the
// following two functions are the same. Using template<> fails to export the
// symbols.
/** @brief 32-bit-id entry point; forwards to ToBlockGPU<int32_t>. */
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
// ToBlock<kDGLCUDA, int32_t>
ToBlockGPU32(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes) {
  return ToBlockGPU<int32_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}

/** @brief 64-bit-id entry point; forwards to ToBlockGPU<int64_t>. */
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
// ToBlock<kDGLCUDA, int64_t>
ToBlockGPU64(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes) {
  return ToBlockGPU<int64_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}

}  // namespace transform
}  // namespace dgl