Unverified Commit a8c81018 authored by Xin Yao, committed by GitHub

[Sampling] Implement dgl.compact_graphs() for the GPU (#3423)

* gpu compact graph template

* cuda compact graph draft

* fix typo

* compact graphs

* pass unit test but fail in training

* example using EdgeDataLoader on the GPU

* refactor cuda_compact_graph and cuda_to_block

* update training scripts

* fix linting

* fix linting

* fix exclude_edges for the GPU

* add --data-cpu & fix copyright
parent 308e52a3
...@@ -67,6 +67,9 @@ def run(proc_id, n_gpus, args, devices, data):
train_mask, val_mask, test_mask, n_classes, g = data
nfeat = g.ndata.pop('feat')
labels = g.ndata.pop('label')
if not args.data_cpu:
nfeat = nfeat.to(device)
labels = labels.to(device)
in_feats = nfeat.shape[1]
train_nid = th.LongTensor(np.nonzero(train_mask)).squeeze()
...@@ -77,6 +80,11 @@ def run(proc_id, n_gpus, args, devices, data):
n_edges = g.num_edges()
train_seeds = th.arange(n_edges)
if args.sample_gpu:
assert n_gpus > 0, "Must have GPUs to enable GPU sampling"
train_seeds = train_seeds.to(device)
g = g.to(device)
# Create sampler
sampler = dgl.dataloading.MultiLayerNeighborSampler(
[int(fanout) for fanout in args.fan_out.split(',')])
...@@ -121,11 +129,11 @@ def run(proc_id, n_gpus, args, devices, data):
tic_step = time.time()
for step, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(dataloader):
batch_inputs = nfeat[input_nodes].to(device)
d_step = time.time()
pos_graph = pos_graph.to(device)
neg_graph = neg_graph.to(device)
blocks = [block.int().to(device) for block in blocks]
d_step = time.time()
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, pos_graph, neg_graph)
...@@ -213,6 +221,13 @@ if __name__ == '__main__':
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=0,
help="Number of sampling processes. Use 0 for no extra process.")
argparser.add_argument('--sample-gpu', action='store_true',
help="Perform the sampling process on the GPU. Must have 0 workers.")
argparser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
args = argparser.parse_args()
devices = list(map(int, args.gpu.split(',')))
...
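For context, a minimal sketch of the GPU-sampling path that the `--sample-gpu` flag above enables, based on the DGL 0.7-era dataloading API; the graph `g`, `device`, fan-outs, and batch size here are placeholders, not values from this commit. Because the graph structure and the seed edges live on the GPU, neighbor sampling runs on the GPU, and `num_workers` must be 0, as the help text notes.

```python
import dgl
import torch as th

# Assumed setup: `g` is a DGLGraph and `device` is a CUDA device.
g = g.to(device)                                   # graph structure on the GPU
train_seeds = th.arange(g.num_edges(), device=device)

sampler = dgl.dataloading.MultiLayerNeighborSampler([10, 25])
dataloader = dgl.dataloading.EdgeDataLoader(
    g, train_seeds, sampler,
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(1),
    batch_size=1024, shuffle=True, drop_last=False,
    num_workers=0)                                 # GPU sampling requires 0 worker processes

for input_nodes, pos_graph, neg_graph, blocks in dataloader:
    # When sampling on the GPU, pos_graph, neg_graph and the blocks are
    # produced on `device`, so the .to(device) calls above become no-ops.
    break
```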
...@@ -368,7 +368,7 @@ class BlockSampler(object):
else:
seed_nodes_in = seed_nodes_in.to(graph_device)
-if self.exclude_edges_in_frontier:
+if self.exclude_edges_in_frontier(g):
frontier = self.sample_frontier(
block_id, g, seed_nodes_in, exclude_eids=exclude_eids)
else:
...
...@@ -1913,7 +1913,7 @@ def compact_graphs(graphs, always_preserve=None, copy_ndata=True, copy_edata=Tru
graphs : DGLGraph or list[DGLGraph]
The graph, or list of graphs.
-All graphs must be on CPU.
+All graphs must be on the same device.
All graphs must have the same set of nodes.
always_preserve : Tensor or dict[str, Tensor], optional
...@@ -2013,7 +2013,6 @@ def compact_graphs(graphs, always_preserve=None, copy_ndata=True, copy_edata=Tru
return []
if graphs[0].is_block:
raise DGLError('Compacting a block graph is not allowed.')
-assert all(g.device == F.cpu() for g in graphs), 'all the graphs must be on CPU'
# Ensure the node types are ordered the same.
# TODO(BarclayII): we ideally need to remove this constraint.
...@@ -2026,8 +2025,8 @@ def compact_graphs(graphs, always_preserve=None, copy_ndata=True, copy_edata=Tru
ntypes, g.ntypes)
assert idtype == g.idtype, "Expect graph data type to be {}, but got {}".format(
idtype, g.idtype)
-assert device == g.device, "Expect graph device to be {}, but got {}".format(
-    device, g.device)
+assert device == g.device, "All graphs must be on the same device. " \
+    "Expect graph device to be {}, but got {}".format(device, g.device)
# Process the dictionary or tensor of "always preserve" nodes
if always_preserve is None:
...
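To illustrate the relaxed device requirement documented above, a small sketch (assuming a CUDA-enabled DGL build; the graph sizes and IDs are arbitrary). The compacted graph keeps the mapping back to the original node IDs in `ndata[dgl.NID]`.

```python
import dgl
import torch as th

g = dgl.heterograph(
    {('user', 'plays', 'game'): (th.tensor([1, 3]), th.tensor([2, 2]))},
    num_nodes_dict={'user': 10, 'game': 5}).to('cuda')

# Previously compact_graphs required CPU inputs; now it runs on whichever
# device the input graphs live on.
new_g = dgl.compact_graphs(g)
print(new_g.device)                    # cuda:0
print(new_g.ndata[dgl.NID]['user'])    # original user IDs kept, e.g. tensor([1, 3])
print(new_g.ndata[dgl.NID]['game'])    # original game IDs kept, e.g. tensor([2])
```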
/*!
- * Copyright (c) 2019 by Contributors
+ * Copyright 2019-2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/compact.cc
* \brief Compact graph implementation
*/
#include "compact.h"
#include <dgl/base_heterograph.h>
#include <dgl/transform.h>
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/container.h>
#include <vector>
#include <utility>
#include "../../c_api_common.h"
...@@ -27,7 +44,7 @@ namespace {
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
-CompactGraphs(
+CompactGraphsCPU(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
// TODO(BarclayII): check whether the node space and metagraph of each graph is the same.
...@@ -121,17 +138,20 @@ CompactGraphs(
};  // namespace
+template<>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
-CompactGraphs(
+CompactGraphs<kDLCPU, int32_t>(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
-  std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result;
-  // TODO(BarclayII): check for all IdArrays
-  CHECK(graphs[0]->DataType() == always_preserve[0]->dtype) << "data type mismatch.";
-  ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
-    result = CompactGraphs<IdType>(graphs, always_preserve);
-  });
-  return result;
+  return CompactGraphsCPU<int32_t>(graphs, always_preserve);
+}
+
+template<>
+std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
+CompactGraphs<kDLCPU, int64_t>(
+    const std::vector<HeteroGraphPtr> &graphs,
+    const std::vector<IdArray> &always_preserve) {
+  return CompactGraphsCPU<int64_t>(graphs, always_preserve);
}
DGL_REGISTER_GLOBAL("transform._CAPI_DGLCompactGraphs")
...@@ -146,7 +166,17 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLCompactGraphs")
for (Value array : always_preserve_refs)
always_preserve.push_back(array->data);
-const auto &result_pair = CompactGraphs(graphs, always_preserve);
+// TODO(BarclayII): check for all IdArrays
+CHECK(graphs[0]->DataType() == always_preserve[0]->dtype) << "data type mismatch.";
+std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result_pair;
+ATEN_XPU_SWITCH_CUDA(graphs[0]->Context().device_type, XPU, "CompactGraphs", {
+  ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
+    result_pair = CompactGraphs<XPU, IdType>(
+      graphs, always_preserve);
+  });
+});
List<HeteroGraphRef> compacted_graph_refs;
List<Value> induced_nodes;
...
/*!
* Copyright 2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/compact.h
* \brief Functions to find and eliminate the common isolated nodes across
* all given graphs with the same set of nodes.
*/
#ifndef DGL_GRAPH_TRANSFORM_COMPACT_H_
#define DGL_GRAPH_TRANSFORM_COMPACT_H_
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include <utility>
namespace dgl {
namespace transform {
/**
* @brief Given a list of graphs with the same set of nodes, find and eliminate
* the common isolated nodes across all graphs.
*
* @tparam XPU The type of device to operate on.
* @tparam IdType The type to use as an index.
* @param graphs The list of graphs to be compacted.
* @param always_preserve The vector of nodes to be preserved.
*
* @return The vector of compacted graphs and the vector of induced nodes.
*/
template<DLDeviceType XPU, typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve);
} // namespace transform
} // namespace dgl
#endif // DGL_GRAPH_TRANSFORM_COMPACT_H_
/*!
* Copyright 2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/cuda/cuda_compact_graph.cu
* \brief Functions to find and eliminate the common isolated nodes across
* all given graphs with the same set of nodes.
*/
#include <dgl/runtime/device_api.h>
#include <dgl/immutable_graph.h>
#include <cuda_runtime.h>
#include <utility>
#include <algorithm>
#include <memory>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../heterograph.h"
#include "../compact.h"
#include "cuda_map_edges.cuh"
using namespace dgl::aten;
using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda;
namespace dgl {
namespace transform {
namespace {
/**
* \brief This function builds node maps for each node type, preserving the
* order of the input nodes. Here it is assumed the nodes are not unique,
* and thus a unique list is generated.
*
* \param input_nodes The set of input nodes.
* \param node_maps The node maps to be constructed.
* \param count_unique_device The number of unique nodes (on the GPU).
* \param unique_nodes_device The unique nodes (on the GPU).
* \param stream The stream to operate on.
*/
template<typename IdType>
void BuildNodeMaps(
const std::vector<IdArray>& input_nodes,
DeviceNodeMap<IdType> * const node_maps,
int64_t * const count_unique_device,
std::vector<IdArray>* const unique_nodes_device,
cudaStream_t stream) {
const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size());
CUDA_CALL(cudaMemsetAsync(
count_unique_device,
0,
num_ntypes*sizeof(*count_unique_device),
stream));
// possibly duplicated nodes
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
const IdArray& nodes = input_nodes[ntype];
if (nodes->shape[0] > 0) {
CHECK_EQ(nodes->ctx.device_type, kDLGPU);
node_maps->LhsHashTable(ntype).FillWithDuplicates(
nodes.Ptr<IdType>(),
nodes->shape[0],
(*unique_nodes_device)[ntype].Ptr<IdType>(),
count_unique_device+ntype,
stream);
}
}
}
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphsGPU(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
cudaStream_t stream = 0;
const auto& ctx = graphs[0]->Context();
auto device = runtime::DeviceAPI::Get(ctx);
CHECK_EQ(ctx.device_type, kDLGPU);
// Step 1: Collect the nodes that have connections for each type.
const int64_t num_ntypes = graphs[0]->NumVertexTypes();
std::vector<std::vector<EdgeArray>> all_edges(graphs.size()); // all_edges[i][etype]
// count the number of nodes per type
std::vector<int64_t> max_vertex_cnt(num_ntypes, 0);
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const int64_t num_etypes = curr_graph->NumEdgeTypes();
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const int64_t n_edges = curr_graph->NumEdges(etype);
max_vertex_cnt[srctype] += n_edges;
max_vertex_cnt[dsttype] += n_edges;
}
}
for (size_t i = 0; i < always_preserve.size(); ++i) {
max_vertex_cnt[i] += always_preserve[i]->shape[0];
}
// gather all nodes
std::vector<IdArray> all_nodes(num_ntypes);
std::vector<int64_t> node_offsets(num_ntypes, 0);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
all_nodes[ntype] = NewIdArray(max_vertex_cnt[ntype], ctx,
sizeof(IdType)*8);
// copy the nodes in always_preserve
if (ntype < always_preserve.size() && always_preserve[ntype]->shape[0] > 0) {
device->CopyDataFromTo(
always_preserve[ntype].Ptr<IdType>(), 0,
all_nodes[ntype].Ptr<IdType>(),
node_offsets[ntype],
sizeof(IdType)*always_preserve[ntype]->shape[0],
always_preserve[ntype]->ctx,
all_nodes[ntype]->ctx,
always_preserve[ntype]->dtype,
stream);
node_offsets[ntype] += sizeof(IdType)*always_preserve[ntype]->shape[0];
}
}
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const int64_t num_etypes = curr_graph->NumEdgeTypes();
all_edges[i].reserve(num_etypes);
for (int64_t etype = 0; etype < num_etypes; ++etype) {
dgl_type_t srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const EdgeArray edges = curr_graph->Edges(etype, "eid");
if (edges.src.defined()) {
device->CopyDataFromTo(
edges.src.Ptr<IdType>(), 0,
all_nodes[srctype].Ptr<IdType>(),
node_offsets[srctype],
sizeof(IdType)*edges.src->shape[0],
edges.src->ctx,
all_nodes[srctype]->ctx,
edges.src->dtype,
stream);
node_offsets[srctype] += sizeof(IdType)*edges.src->shape[0];
}
if (edges.dst.defined()) {
device->CopyDataFromTo(
edges.dst.Ptr<IdType>(), 0,
all_nodes[dsttype].Ptr<IdType>(),
node_offsets[dsttype],
sizeof(IdType)*edges.dst->shape[0],
edges.dst->ctx,
all_nodes[dsttype]->ctx,
edges.dst->dtype,
stream);
node_offsets[dsttype] += sizeof(IdType)*edges.dst->shape[0];
}
all_edges[i].push_back(edges);
}
}
// Step 2: Relabel the nodes for each type to a smaller ID space
// using BuildNodeMaps
// allocate space for map creation
// the hashmap on GPU
DeviceNodeMap<IdType> node_maps(max_vertex_cnt, 0, ctx, stream);
// number of unique nodes per type on CPU
std::vector<int64_t> num_induced_nodes(num_ntypes);
// number of unique nodes per type on GPU
int64_t * count_unique_device = static_cast<int64_t*>(
device->AllocWorkspace(ctx, sizeof(int64_t)*num_ntypes));
// the set of unique nodes per type
std::vector<IdArray> induced_nodes(num_ntypes);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
induced_nodes[ntype] = NewIdArray(max_vertex_cnt[ntype], ctx,
sizeof(IdType)*8);
}
BuildNodeMaps(
all_nodes,
&node_maps,
count_unique_device,
&induced_nodes,
stream);
device->CopyDataFromTo(
count_unique_device, 0,
num_induced_nodes.data(), 0,
sizeof(*num_induced_nodes.data())*num_ntypes,
ctx,
DGLContext{kDLCPU, 0},
DGLType{kDLInt, 64, 1},
stream);
device->StreamSync(ctx, stream);
// wait for the node counts to finish transferring
device->FreeWorkspace(ctx, count_unique_device);
// resize induced nodes
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
induced_nodes[ntype]->shape[0] = num_induced_nodes[ntype];
}
// Step 3: Remap the edges of each graph using MapEdges
std::vector<HeteroGraphPtr> new_graphs;
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const auto meta_graph = curr_graph->meta_graph();
const int64_t num_etypes = curr_graph->NumEdgeTypes();
std::vector<HeteroGraphPtr> rel_graphs;
rel_graphs.reserve(num_etypes);
std::vector<IdArray> new_src;
std::vector<IdArray> new_dst;
std::tie(new_src, new_dst) = MapEdges(
curr_graph, all_edges[i], node_maps, stream);
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
rel_graphs.push_back(UnitGraph::CreateFromCOO(
srctype == dsttype ? 1 : 2,
induced_nodes[srctype]->shape[0],
induced_nodes[dsttype]->shape[0],
new_src[etype],
new_dst[etype]));
}
new_graphs.push_back(CreateHeteroGraph(meta_graph, rel_graphs, num_induced_nodes));
}
return std::make_pair(new_graphs, induced_nodes);
}
} // namespace
template<>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDLGPU, int32_t>(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
return CompactGraphsGPU<int32_t>(graphs, always_preserve);
}
template<>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDLGPU, int64_t>(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
return CompactGraphsGPU<int64_t>(graphs, always_preserve);
}
} // namespace transform
} // namespace dgl
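The CUDA implementation above works in three steps: collect every edge endpoint (plus the `always_preserve` nodes), build an order-preserving unique node map with the GPU hash table, and remap the edges into the compact ID space. As a rough reference for what those steps compute, here is a hedged NumPy-style sketch for a single homogeneous graph; it ignores node and edge types and is not how the kernels are actually implemented.

```python
import numpy as np

def compact_reference(src, dst, always_preserve=np.array([], dtype=np.int64)):
    # Step 1: gather every node id that appears in always_preserve or on an edge.
    all_nodes = np.concatenate([always_preserve, src, dst])

    # Step 2: build an order-preserving unique list (the role of the hash
    # table's FillWithDuplicates) and a map from old id -> new compact id.
    _, first_idx = np.unique(all_nodes, return_index=True)
    induced_nodes = all_nodes[np.sort(first_idx)]
    relabel = {int(old): new for new, old in enumerate(induced_nodes)}

    # Step 3: remap the edge endpoints into the compact id space (MapEdges).
    new_src = np.array([relabel[int(v)] for v in src])
    new_dst = np.array([relabel[int(v)] for v in dst])
    return new_src, new_dst, induced_nodes

# e.g. edges 1->3 and 3->5 compact to 0->1 and 1->2 with induced nodes [1, 3, 5]
print(compact_reference(np.array([1, 3]), np.array([3, 5])))
```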
/*!
* Copyright 2020-2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/cuda/cuda_map_edges.cuh
* \brief Device level functions for mapping edges.
*/
#ifndef DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_
#define DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_
#include <dgl/runtime/c_runtime_api.h>
#include <cuda_runtime.h>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../../runtime/cuda/cuda_hashtable.cuh"
using namespace dgl::aten;
using namespace dgl::runtime::cuda;
namespace dgl {
namespace transform {
namespace cuda {
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__device__ void map_vertex_ids(
const IdType * const global,
IdType * const new_global,
const IdType num_vertices,
const DeviceOrderedHashTable<IdType>& table) {
assert(BLOCK_SIZE == blockDim.x);
using Mapping = typename OrderedHashTable<IdType>::Mapping;
const IdType tile_start = TILE_SIZE*blockIdx.x;
const IdType tile_end = min(TILE_SIZE*(blockIdx.x+1), num_vertices);
for (IdType idx = threadIdx.x+tile_start; idx < tile_end; idx+=BLOCK_SIZE) {
const Mapping& mapping = *table.Search(global[idx]);
new_global[idx] = mapping.local;
}
}
/**
* \brief Generate mapped edge endpoint ids.
*
* \tparam IdType The type of id.
* \tparam BLOCK_SIZE The size of each thread block.
* \tparam TILE_SIZE The number of edges to process per thread block.
* \param global_srcs_device The source ids to map.
* \param new_global_srcs_device The mapped source ids (output).
* \param global_dsts_device The destination ids to map.
* \param new_global_dsts_device The mapped destination ids (output).
* \param num_edges The number of edges to map.
* \param src_mapping The mapping of source ids.
* \param src_hash_size The size of the source id hash table/mapping.
* \param dst_mapping The mapping of destination ids.
* \param dst_hash_size The size of the destination id hash table/mapping.
*/
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__global__ void map_edge_ids(
const IdType * const global_srcs_device,
IdType * const new_global_srcs_device,
const IdType * const global_dsts_device,
IdType * const new_global_dsts_device,
const IdType num_edges,
DeviceOrderedHashTable<IdType> src_mapping,
DeviceOrderedHashTable<IdType> dst_mapping) {
assert(BLOCK_SIZE == blockDim.x);
assert(2 == gridDim.y);
if (blockIdx.y == 0) {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_srcs_device,
new_global_srcs_device,
num_edges,
src_mapping);
} else {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_dsts_device,
new_global_dsts_device,
num_edges,
dst_mapping);
}
}
/**
* \brief Device level node maps for each node type.
*
* \param num_nodes Number of nodes per type.
* \param offset When offset is set to 0, LhsHashTable is identical to RhsHashTable.
* Or set it to num_nodes.size()/2 to use separate LhsHashTable and RhsHashTable.
* \param ctx The DGL context.
* \param stream The stream to operate on.
*/
template<typename IdType>
class DeviceNodeMap {
public:
using Mapping = typename OrderedHashTable<IdType>::Mapping;
DeviceNodeMap(
const std::vector<int64_t>& num_nodes,
const int64_t offset,
DGLContext ctx,
cudaStream_t stream) :
num_types_(num_nodes.size()),
rhs_offset_(offset),
workspaces_(),
hash_tables_(),
ctx_(ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
hash_tables_.reserve(num_types_);
workspaces_.reserve(num_types_);
for (int64_t i = 0; i < num_types_; ++i) {
hash_tables_.emplace_back(
new OrderedHashTable<IdType>(
num_nodes[i],
ctx_,
stream));
}
}
OrderedHashTable<IdType>& LhsHashTable(
const size_t index) {
return HashData(index);
}
OrderedHashTable<IdType>& RhsHashTable(
const size_t index) {
return HashData(index+rhs_offset_);
}
const OrderedHashTable<IdType>& LhsHashTable(
const size_t index) const {
return HashData(index);
}
const OrderedHashTable<IdType>& RhsHashTable(
const size_t index) const {
return HashData(index+rhs_offset_);
}
IdType LhsHashSize(
const size_t index) const {
return HashSize(index);
}
IdType RhsHashSize(
const size_t index) const {
return HashSize(rhs_offset_+index);
}
size_t Size() const {
return hash_tables_.size();
}
private:
int64_t num_types_;
size_t rhs_offset_;
std::vector<void*> workspaces_;
std::vector<std::unique_ptr<OrderedHashTable<IdType>>> hash_tables_;
DGLContext ctx_;
inline OrderedHashTable<IdType>& HashData(
const size_t index) {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline const OrderedHashTable<IdType>& HashData(
const size_t index) const {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline IdType HashSize(
const size_t index) const {
return HashData(index).size();
}
};
template<typename IdType>
inline size_t RoundUpDiv(
const IdType num,
const size_t divisor) {
return static_cast<IdType>(num/divisor) + (num % divisor == 0 ? 0 : 1);
}
template<typename IdType>
inline IdType RoundUp(
const IdType num,
const size_t unit) {
return RoundUpDiv(num, unit)*unit;
}
template<typename IdType>
std::tuple<std::vector<IdArray>, std::vector<IdArray>>
MapEdges(
HeteroGraphPtr graph,
const std::vector<EdgeArray>& edge_sets,
const DeviceNodeMap<IdType>& node_map,
cudaStream_t stream) {
constexpr const int BLOCK_SIZE = 128;
constexpr const size_t TILE_SIZE = 1024;
const auto& ctx = graph->Context();
std::vector<IdArray> new_lhs;
new_lhs.reserve(edge_sets.size());
std::vector<IdArray> new_rhs;
new_rhs.reserve(edge_sets.size());
// The next performance optimization here is to perform the mapping of all edge
// types in a single kernel launch.
const int64_t num_edge_sets = static_cast<int64_t>(edge_sets.size());
for (int64_t etype = 0; etype < num_edge_sets; ++etype) {
const EdgeArray& edges = edge_sets[etype];
if (edges.id.defined() && edges.src->shape[0] > 0) {
const int64_t num_edges = edges.src->shape[0];
new_lhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
new_rhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
const auto src_dst_types = graph->GetEndpointTypes(etype);
const int src_type = src_dst_types.first;
const int dst_type = src_dst_types.second;
const dim3 grid(RoundUpDiv(num_edges, TILE_SIZE), 2);
const dim3 block(BLOCK_SIZE);
// map the srcs
map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE><<<
grid,
block,
0,
stream>>>(
edges.src.Ptr<IdType>(),
new_lhs.back().Ptr<IdType>(),
edges.dst.Ptr<IdType>(),
new_rhs.back().Ptr<IdType>(),
num_edges,
node_map.LhsHashTable(src_type).DeviceHandle(),
node_map.RhsHashTable(dst_type).DeviceHandle());
CUDA_CALL(cudaGetLastError());
} else {
new_lhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
new_rhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
}
}
return std::tuple<std::vector<IdArray>, std::vector<IdArray>>(
std::move(new_lhs), std::move(new_rhs));
}
} // namespace cuda
} // namespace transform
} // namespace dgl
#endif  // DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_
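To make the kernel's work assignment above concrete, a small Python sketch of the tile/thread arithmetic in `map_vertex_ids` (the constants mirror the `BLOCK_SIZE` and `TILE_SIZE` values in `MapEdges`; the second grid dimension in `map_edge_ids` simply selects whether a block maps source or destination ids).

```python
# Plain-Python view of the index arithmetic in map_vertex_ids: block x covers
# ids in [x*TILE_SIZE, min((x+1)*TILE_SIZE, n)), and threads stride by BLOCK_SIZE.
BLOCK_SIZE, TILE_SIZE = 128, 1024

def ids_for_thread(block_x, thread_x, num_vertices):
    tile_start = TILE_SIZE * block_x
    tile_end = min(TILE_SIZE * (block_x + 1), num_vertices)
    return list(range(tile_start + thread_x, tile_end, BLOCK_SIZE))

# Thread 0 of block 1 touches ids 1024, 1152, ..., 1920 for a 2000-vertex input.
print(ids_for_thread(1, 0, 2000))
```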
...@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
-* \file graph/transform/cuda_to_block.cu
+* \file graph/transform/cuda/cuda_to_block.cu
* \brief Functions to convert a set of edges into a graph block with local
* ids.
*/
...@@ -27,182 +27,19 @@
#include <memory>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../../runtime/cuda/cuda_hashtable.cuh"
#include "../../heterograph.h" #include "../../heterograph.h"
#include "../to_bipartite.h" #include "../to_bipartite.h"
#include "cuda_map_edges.cuh"
using namespace dgl::aten;
using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda;
namespace dgl {
namespace transform {
namespace {
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__device__ void map_vertex_ids(
const IdType * const global,
IdType * const new_global,
const IdType num_vertices,
const DeviceOrderedHashTable<IdType>& table) {
assert(BLOCK_SIZE == blockDim.x);
using Mapping = typename OrderedHashTable<IdType>::Mapping;
const IdType tile_start = TILE_SIZE*blockIdx.x;
const IdType tile_end = min(TILE_SIZE*(blockIdx.x+1), num_vertices);
for (IdType idx = threadIdx.x+tile_start; idx < tile_end; idx+=BLOCK_SIZE) {
const Mapping& mapping = *table.Search(global[idx]);
new_global[idx] = mapping.local;
}
}
/**
* \brief Generate mapped edge endpoint ids.
*
* \tparam IdType The type of id.
* \tparam BLOCK_SIZE The size of each thread block.
* \tparam TILE_SIZE The number of edges to process per thread block.
* \param global_srcs_device The source ids to map.
* \param new_global_srcs_device The mapped source ids (output).
* \param global_dsts_device The destination ids to map.
* \param new_global_dsts_device The mapped destination ids (output).
* \param num_edges The number of edges to map.
* \param src_mapping The mapping of sources ids.
* \param src_hash_size The the size of source id hash table/mapping.
* \param dst_mapping The mapping of destination ids.
* \param dst_hash_size The the size of destination id hash table/mapping.
*/
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__global__ void map_edge_ids(
const IdType * const global_srcs_device,
IdType * const new_global_srcs_device,
const IdType * const global_dsts_device,
IdType * const new_global_dsts_device,
const IdType num_edges,
DeviceOrderedHashTable<IdType> src_mapping,
DeviceOrderedHashTable<IdType> dst_mapping) {
assert(BLOCK_SIZE == blockDim.x);
assert(2 == gridDim.y);
if (blockIdx.y == 0) {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_srcs_device,
new_global_srcs_device,
num_edges,
src_mapping);
} else {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_dsts_device,
new_global_dsts_device,
num_edges,
dst_mapping);
}
}
template<typename IdType>
inline size_t RoundUpDiv(
const IdType num,
const size_t divisor) {
return static_cast<IdType>(num/divisor) + (num % divisor == 0 ? 0 : 1);
}
template<typename IdType>
inline IdType RoundUp(
const IdType num,
const size_t unit) {
return RoundUpDiv(num, unit)*unit;
}
template<typename IdType>
class DeviceNodeMap {
public:
using Mapping = typename OrderedHashTable<IdType>::Mapping;
DeviceNodeMap(
const std::vector<int64_t>& num_nodes,
DGLContext ctx,
cudaStream_t stream) :
num_types_(num_nodes.size()),
rhs_offset_(num_types_/2),
workspaces_(),
hash_tables_(),
ctx_(ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
hash_tables_.reserve(num_types_);
workspaces_.reserve(num_types_);
for (int64_t i = 0; i < num_types_; ++i) {
hash_tables_.emplace_back(
new OrderedHashTable<IdType>(
num_nodes[i],
ctx_,
stream));
}
}
OrderedHashTable<IdType>& LhsHashTable(
const size_t index) {
return HashData(index);
}
OrderedHashTable<IdType>& RhsHashTable(
const size_t index) {
return HashData(index+rhs_offset_);
}
const OrderedHashTable<IdType>& LhsHashTable(
const size_t index) const {
return HashData(index);
}
const OrderedHashTable<IdType>& RhsHashTable(
const size_t index) const {
return HashData(index+rhs_offset_);
}
IdType LhsHashSize(
const size_t index) const {
return HashSize(index);
}
IdType RhsHashSize(
const size_t index) const {
return HashSize(rhs_offset_+index);
}
size_t Size() const {
return hash_tables_.size();
}
private:
int64_t num_types_;
size_t rhs_offset_;
std::vector<void*> workspaces_;
std::vector<std::unique_ptr<OrderedHashTable<IdType>>> hash_tables_;
DGLContext ctx_;
inline OrderedHashTable<IdType>& HashData(
const size_t index) {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline const OrderedHashTable<IdType>& HashData(
const size_t index) const {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline IdType HashSize(
const size_t index) const {
return HashData(index).size();
}
};
template<typename IdType>
class DeviceNodeMapMaker {
public:
...@@ -315,67 +152,6 @@ class DeviceNodeMapMaker {
IdType max_num_nodes_;
};
template<typename IdType>
std::tuple<std::vector<IdArray>, std::vector<IdArray>>
MapEdges(
HeteroGraphPtr graph,
const std::vector<EdgeArray>& edge_sets,
const DeviceNodeMap<IdType>& node_map,
cudaStream_t stream) {
constexpr const int BLOCK_SIZE = 128;
constexpr const size_t TILE_SIZE = 1024;
const auto& ctx = graph->Context();
std::vector<IdArray> new_lhs;
new_lhs.reserve(edge_sets.size());
std::vector<IdArray> new_rhs;
new_rhs.reserve(edge_sets.size());
// The next peformance optimization here, is to perform mapping of all edge
// types in a single kernel launch.
const int64_t num_edge_sets = static_cast<int64_t>(edge_sets.size());
for (int64_t etype = 0; etype < num_edge_sets; ++etype) {
const EdgeArray& edges = edge_sets[etype];
if (edges.id.defined() && edges.src->shape[0] > 0) {
const int64_t num_edges = edges.src->shape[0];
new_lhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
new_rhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
const auto src_dst_types = graph->GetEndpointTypes(etype);
const int src_type = src_dst_types.first;
const int dst_type = src_dst_types.second;
const dim3 grid(RoundUpDiv(num_edges, TILE_SIZE), 2);
const dim3 block(BLOCK_SIZE);
// map the srcs
map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE><<<
grid,
block,
0,
stream>>>(
edges.src.Ptr<IdType>(),
new_lhs.back().Ptr<IdType>(),
edges.dst.Ptr<IdType>(),
new_rhs.back().Ptr<IdType>(),
num_edges,
node_map.LhsHashTable(src_type).DeviceHandle(),
node_map.RhsHashTable(dst_type).DeviceHandle());
CUDA_CALL(cudaGetLastError());
} else {
new_lhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
new_rhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
}
}
return std::tuple<std::vector<IdArray>, std::vector<IdArray>>(
std::move(new_lhs), std::move(new_rhs));
}
// Since partial specialization is not allowed for functions, use this as an
// intermediate for ToBlock where XPU = kDLGPU.
...@@ -487,7 +263,7 @@ ToBlockGPU(
// allocate space for map creation process
DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
-DeviceNodeMap<IdType> node_maps(maxNodesPerType, ctx, stream);
+DeviceNodeMap<IdType> node_maps(maxNodesPerType, num_ntypes, ctx, stream);
if (generate_lhs_nodes) {
lhs_nodes.reserve(num_ntypes);
...
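The hunk above only passes `num_ntypes` as the new offset argument of the shared `DeviceNodeMap` and reuses the edge-mapping helpers moved into cuda_map_edges.cuh; the user-visible path it serves is the usual sample-then-to_block flow. A hedged sketch of that flow on the GPU (random graph, arbitrary seeds and fan-out, assuming a CUDA-enabled build):

```python
import dgl
import torch as th

g = dgl.rand_graph(1000, 5000).to('cuda')        # random homogeneous graph on the GPU
seeds = th.arange(10, device='cuda')             # destination (seed) nodes

frontier = dgl.sampling.sample_neighbors(g, seeds, 5)
block = dgl.to_block(frontier, dst_nodes=seeds)  # exercises the GPU ToBlock path
print(block.device, block.num_src_nodes(), block.num_dst_nodes())
```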
/*!
* Copyright (c) 2020 by Contributors
* \file graph/transform/cuda_to_block.h
* \brief Functions to convert a set of edges into a graph block with local
* ids.
*/
#ifndef DGL_GRAPH_TRANSFORM_CUDA_CUDA_TO_BLOCK_H_
#define DGL_GRAPH_TRANSFORM_CUDA_CUDA_TO_BLOCK_H_
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include <tuple>
namespace dgl {
namespace transform {
namespace cuda {
/**
* @brief Generate a subgraph with locally numbered vertices, from the given
* edge set.
*
* @param graph The set of edges to construct the subgraph from.
* @param rhs_nodes The unique set of destination vertices.
* @param include_rhs_in_lhs Whether or not to include the `rhs_nodes` in the
* set of source vertices for purposes of local numbering.
*
* @return The subgraph, the unique set of source nodes, and the mapping of
* subgraph edges to global edges.
*/
std::tuple<HeteroGraphPtr, std::vector<IdArray>, std::vector<IdArray>>
CudaToBlock(
HeteroGraphPtr graph,
const std::vector<IdArray>& rhs_nodes,
const bool include_rhs_in_lhs);
} // namespace cuda
} // namespace transform
} // namespace dgl
#endif // DGL_GRAPH_TRANSFORM_CUDA_CUDA_TO_BLOCK_H_
...@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
-* \file graph/transform/cuda_to_block.cu
+* \file graph/transform/to_bipartite.h
* \brief Functions to convert a set of edges into a graph block with local
* ids.
*/
...
...@@ -662,24 +662,23 @@ def test_reorder_nodes():
old_neighs2 = g.predecessors(old_nid)
assert np.all(np.sort(old_neighs1) == np.sort(F.asnumpy(old_neighs2)))
-@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU compaction not implemented")
@parametrize_dtype
def test_compact(idtype):
g1 = dgl.heterograph({
('user', 'follow', 'user'): ([1, 3], [3, 5]),
('user', 'plays', 'game'): ([2, 3, 2], [4, 4, 5]),
('game', 'wished-by', 'user'): ([6, 5], [7, 7])},
-{'user': 20, 'game': 10}, idtype=idtype)
+{'user': 20, 'game': 10}, idtype=idtype, device=F.ctx())
g2 = dgl.heterograph({
('game', 'clicked-by', 'user'): ([3], [1]),
('user', 'likes', 'user'): ([1, 8], [8, 9])},
-{'user': 20, 'game': 10}, idtype=idtype)
+{'user': 20, 'game': 10}, idtype=idtype, device=F.ctx())
g3 = dgl.heterograph({('user', '_E', 'user'): ((0, 1), (1, 2))},
-{'user': 10}, idtype=idtype)
+{'user': 10}, idtype=idtype, device=F.ctx())
g4 = dgl.heterograph({('user', '_E', 'user'): ((1, 3), (3, 5))},
-{'user': 10}, idtype=idtype)
+{'user': 10}, idtype=idtype, device=F.ctx())
def _check(g, new_g, induced_nodes):
assert g.ntypes == new_g.ntypes
...