// !!! This is a file automatically generated by hipify!!!
/**
 *  Copyright 2020-2021 Contributors
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 * @file graph/transform/cuda/cuda_to_block.cu
 * @brief Functions to convert a set of edges into a graph block with local
 * ids.
 *
 * Tested via python wrapper: python/dgl/path/to/to_block.py
 */

#include <hip/hip_runtime.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h>

#include <algorithm>
#include <memory>
#include <utility>
#include <vector>

#include "../../../runtime/cuda/cuda_common.h"
#include "../../heterograph.h"
#include "../to_block.h"
#include "cuda_map_edges.cuh"

using namespace dgl::aten;
using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda;

using TensorDispatcher = dgl::runtime::TensorDispatcher;

namespace dgl {
namespace transform {

namespace {

template <typename IdType>
class DeviceNodeMapMaker {
 public:
  explicit DeviceNodeMapMaker(const std::vector<int64_t>& maxNodesPerType)
      : max_num_nodes_(0) {
    max_num_nodes_ =
        *std::max_element(maxNodesPerType.begin(), maxNodesPerType.end());
  }

  /**
   * @brief This function builds node maps for each node type, preserving the
   * order of the input nodes. Here it is assumed the lhs_nodes are not unique,
   * and thus a unique list is generated.
   *
   * @param lhs_nodes The set of source input nodes.
   * @param rhs_nodes The set of destination input nodes.
   * @param node_maps The node maps to be constructed.
   * @param count_lhs_device The number of unique source nodes (on the GPU).
   * @param lhs_device The unique source nodes (on the GPU).
   * @param stream The stream to operate on.
   */
  void Make(
      const std::vector<IdArray>& lhs_nodes,
      const std::vector<IdArray>& rhs_nodes,
      DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
      std::vector<IdArray>* const lhs_device, hipStream_t stream) {
    const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();

    CUDA_CALL(hipMemsetAsync(
        count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));

    // possibly duplicate lhs nodes
    const int64_t lhs_num_ntypes = static_cast<int64_t>(lhs_nodes.size());
    for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) {
      const IdArray& nodes = lhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
        node_maps->LhsHashTable(ntype).FillWithDuplicates(
            nodes.Ptr<IdType>(), nodes->shape[0],
            (*lhs_device)[ntype].Ptr<IdType>(), count_lhs_device + ntype,
            stream);
      }
    }

    // unique rhs nodes
    const int64_t rhs_num_ntypes = static_cast<int64_t>(rhs_nodes.size());
    for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) {
      const IdArray& nodes = rhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        node_maps->RhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }
  }

  /**
   * @brief This function builds node maps for each node type, preserving the
   * order of the input nodes. Here it is assumed both lhs_nodes and rhs_nodes
   * are unique.
   *
   * @param lhs_nodes The set of source input nodes.
   * @param rhs_nodes The set of destination input nodes.
   * @param node_maps The node maps to be constructed.
   * @param stream The stream to operate on.
   */
  void Make(
      const std::vector<IdArray>& lhs_nodes,
      const std::vector<IdArray>& rhs_nodes,
      DeviceNodeMap<IdType>* const node_maps, hipStream_t stream) {
    const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();

    // unique lhs nodes
    const int64_t lhs_num_ntypes = static_cast<int64_t>(lhs_nodes.size());
    for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) {
      const IdArray& nodes = lhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
        node_maps->LhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }

    // unique rhs nodes
    const int64_t rhs_num_ntypes = static_cast<int64_t>(rhs_nodes.size());
    for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) {
      const IdArray& nodes = rhs_nodes[ntype];
      if (nodes->shape[0] > 0) {
        node_maps->RhsHashTable(ntype).FillWithUnique(
            nodes.Ptr<IdType>(), nodes->shape[0], stream);
      }
    }
  }

 private:
  IdType max_num_nodes_;
};

template <typename IdType>
struct CUDAIdsMapper {
  std::tuple<std::vector<IdArray>, std::vector<IdArray>> operator()(
      const HeteroGraphPtr& graph, bool include_rhs_in_lhs, int64_t num_ntypes,
      const DGLContext& ctx, const std::vector<int64_t>& maxNodesPerType,
      const std::vector<EdgeArray>& edge_arrays,
      const std::vector<IdArray>& src_nodes,
      const std::vector<IdArray>& rhs_nodes,
      std::vector<IdArray>* const lhs_nodes_ptr,
      std::vector<int64_t>* const num_nodes_per_type_ptr) {
    std::vector<IdArray>& lhs_nodes = *lhs_nodes_ptr;
    std::vector<int64_t>& num_nodes_per_type = *num_nodes_per_type_ptr;
    const bool generate_lhs_nodes = lhs_nodes.empty();

    auto device = runtime::DeviceAPI::Get(ctx);
    hipStream_t stream = runtime::getCurrentHIPStreamMasqueradingAsCUDA();

    // Allocate space for map creation process.
    DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
    DeviceNodeMap<IdType> node_maps(maxNodesPerType, num_ntypes, ctx, stream);

    if (generate_lhs_nodes) {
      lhs_nodes.reserve(num_ntypes);
      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
        lhs_nodes.emplace_back(
            NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
      }
    }

    hipEvent_t copyEvent;
    NDArray new_len_tensor;

    // Populate the mappings.
    if (generate_lhs_nodes) {
      int64_t* count_lhs_device = static_cast<int64_t*>(
          device->AllocWorkspace(ctx, sizeof(int64_t) * num_ntypes * 2));

      maker.Make(
          src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
          stream);

      CUDA_CALL(hipEventCreate(&copyEvent));
      if (TensorDispatcher::Global()->IsAvailable()) {
        new_len_tensor = NDArray::PinnedEmpty(
            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
            DGLContext{kDGLCPU, 0});
      } else {
        // use pageable memory, it will unnecessarily block but be functional
        new_len_tensor = NDArray::Empty(
            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
            DGLContext{kDGLCPU, 0});
      }
      CUDA_CALL(hipMemcpyAsync(
          new_len_tensor->data, count_lhs_device,
          sizeof(*num_nodes_per_type.data()) * num_ntypes,
          hipMemcpyDeviceToHost, stream));
      CUDA_CALL(hipEventRecord(copyEvent, stream));

      device->FreeWorkspace(ctx, count_lhs_device);
    } else {
      maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);

      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
        num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
      }
    }

    // Map node numberings from global to local, and build pointer for CSR.
    auto ret = MapEdges(graph, edge_arrays, node_maps, stream);

    if (generate_lhs_nodes) {
      // wait for the previous copy
      CUDA_CALL(hipEventSynchronize(copyEvent));
      CUDA_CALL(hipEventDestroy(copyEvent));

      // Resize lhs nodes.
      for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
        num_nodes_per_type[ntype] =
            static_cast<int64_t*>(new_len_tensor->data)[ntype];
        lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype];
      }
    }

    return ret;
  }
};

template <typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlockGPU(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes_ptr) {
  return dgl::transform::ProcessToBlock<IdType>(
      graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes_ptr,
      CUDAIdsMapper<IdType>());
}

}  // namespace

// Use explicit names to get around MSVC's broken mangling that thinks the
// following two functions are the same. Using template<> fails to export the
// symbols.
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
// ToBlock<kDGLCUDA, int32_t>
ToBlockGPU32(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes) {
  return ToBlockGPU<int32_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}

std::tuple<HeteroGraphPtr, std::vector<IdArray>>
// ToBlock<kDGLCUDA, int64_t>
ToBlockGPU64(
    HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
    bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes) {
  return ToBlockGPU<int64_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}

}  // namespace transform
}  // namespace dgl
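
// A minimal usage sketch of the exported entry points above, kept as a
// comment so it does not affect compilation of this generated file. It is
// illustrative only: the construction of `graph` (a GPU-resident
// HeteroGraphPtr) and `rhs_nodes` (per-node-type destination/seed id arrays
// on the GPU) is assumed and not shown here.
//
//   std::vector<dgl::IdArray> lhs_nodes;  // left empty so the mapper
//                                         // generates the unique source set
//   dgl::HeteroGraphPtr block;
//   std::vector<dgl::IdArray> induced_edges;
//   std::tie(block, induced_edges) = dgl::transform::ToBlockGPU64(
//       graph, rhs_nodes, /*include_rhs_in_lhs=*/true, &lhs_nodes);
//   // `block` is the graph block with local ids; `lhs_nodes` now holds the
//   // unique source nodes per type in their original order.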