Unverified commit a8c81018 authored by Xin Yao, committed by GitHub

[Sampling] Implement dgl.compact_graphs() for the GPU (#3423)

* gpu compact graph template

* cuda compact graph draft

* fix typo

* compact graphs

* pass unit test but fail in training

* example using EdgeDataLoader on the GPU

* refactor cuda_compact_graph and cuda_to_block

* update training scripts

* fix linting

* fix linting

* fix exclude_edges for the GPU

* add --data-cpu & fix copyright
parent 308e52a3
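For reference, a minimal sketch (not part of the diff; it assumes a CUDA-enabled build of DGL, and the node counts below are made up) of what this commit enables at the Python level: calling dgl.compact_graphs() on graphs that already live on the GPU.

import dgl

# Two heterographs sharing the same node space, both moved to the GPU.
g1 = dgl.heterograph({('user', 'follow', 'user'): ([1, 3], [3, 5])},
                     {'user': 20}).to('cuda')
g2 = dgl.heterograph({('user', 'follow', 'user'): ([2, 4], [4, 6])},
                     {'user': 20}).to('cuda')

# Previously this required CPU graphs; with this commit the compaction runs
# directly on the GPU and the outputs share a densely relabeled node ID space.
new_g1, new_g2 = dgl.compact_graphs([g1, g2])
assert new_g1.device == g1.device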
......@@ -67,6 +67,9 @@ def run(proc_id, n_gpus, args, devices, data):
train_mask, val_mask, test_mask, n_classes, g = data
nfeat = g.ndata.pop('feat')
labels = g.ndata.pop('label')
if not args.data_cpu:
nfeat = nfeat.to(device)
labels = labels.to(device)
in_feats = nfeat.shape[1]
train_nid = th.LongTensor(np.nonzero(train_mask)).squeeze()
......@@ -77,6 +80,11 @@ def run(proc_id, n_gpus, args, devices, data):
n_edges = g.num_edges()
train_seeds = th.arange(n_edges)
if args.sample_gpu:
assert n_gpus > 0, "Must have GPUs to enable GPU sampling"
train_seeds = train_seeds.to(device)
g = g.to(device)
# Create sampler
sampler = dgl.dataloading.MultiLayerNeighborSampler(
[int(fanout) for fanout in args.fan_out.split(',')])
......@@ -121,11 +129,11 @@ def run(proc_id, n_gpus, args, devices, data):
tic_step = time.time()
for step, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(dataloader):
batch_inputs = nfeat[input_nodes].to(device)
d_step = time.time()
pos_graph = pos_graph.to(device)
neg_graph = neg_graph.to(device)
blocks = [block.int().to(device) for block in blocks]
d_step = time.time()
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, pos_graph, neg_graph)
......@@ -213,6 +221,13 @@ if __name__ == '__main__':
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=0,
help="Number of sampling processes. Use 0 for no extra process.")
argparser.add_argument('--sample-gpu', action='store_true',
help="Perform the sampling process on the GPU. Must have 0 workers.")
argparser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on the GPU to save time on data copies during training. This "
"may be undesirable if they cannot fit in GPU memory at once. "
"This flag keeps them on the CPU instead.")
args = argparser.parse_args()
devices = list(map(int, args.gpu.split(',')))
......
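A condensed sketch of the GPU sampling path that the new --sample-gpu flag turns on (the rand_graph call, fan-outs, and batch size below are illustrative stand-ins, not values taken from the script): the graph and the seed edge IDs must both sit on the GPU, and no sampling worker processes may be used.

import dgl
import torch as th

# A toy graph standing in for the training graph.
g = dgl.rand_graph(1000, 5000).to('cuda')
train_seeds = th.arange(g.num_edges(), device='cuda')

sampler = dgl.dataloading.MultiLayerNeighborSampler([10, 25])
dataloader = dgl.dataloading.EdgeDataLoader(
    g, train_seeds, sampler,
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(1),
    batch_size=1000, shuffle=True, drop_last=False,
    num_workers=0)  # GPU-based sampling requires zero worker processes

for input_nodes, pos_graph, neg_graph, blocks in dataloader:
    # With --data-cpu, node features stay on the host and are sliced and
    # copied to the device here, one mini-batch at a time.
    break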
......@@ -368,7 +368,7 @@ class BlockSampler(object):
else:
seed_nodes_in = seed_nodes_in.to(graph_device)
if self.exclude_edges_in_frontier:
if self.exclude_edges_in_frontier(g):
frontier = self.sample_frontier(
block_id, g, seed_nodes_in, exclude_eids=exclude_eids)
else:
......
......@@ -1913,7 +1913,7 @@ def compact_graphs(graphs, always_preserve=None, copy_ndata=True, copy_edata=Tru
graphs : DGLGraph or list[DGLGraph]
The graph, or list of graphs.
All graphs must be on CPU.
All graphs must be on the same device.
All graphs must have the same set of nodes.
always_preserve : Tensor or dict[str, Tensor], optional
......@@ -2013,7 +2013,6 @@ def compact_graphs(graphs, always_preserve=None, copy_ndata=True, copy_edata=Tru
return []
if graphs[0].is_block:
raise DGLError('Compacting a block graph is not allowed.')
assert all(g.device == F.cpu() for g in graphs), 'all the graphs must be on CPU'
# Ensure the node types are ordered the same.
# TODO(BarclayII): we ideally need to remove this constraint.
......@@ -2026,8 +2025,8 @@ def compact_graphs(graphs, always_preserve=None, copy_ndata=True, copy_edata=Tru
ntypes, g.ntypes)
assert idtype == g.idtype, "Expect graph data type to be {}, but got {}".format(
idtype, g.idtype)
assert device == g.device, "Expect graph device to be {}, but got {}".format(
device, g.device)
assert device == g.device, "All graphs must be on the same device. " \
"Expect graph device to be {}, but got {}".format(device, g.device)
# Process the dictionary or tensor of "always preserve" nodes
if always_preserve is None:
......
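A short sketch of the behavior implied by the updated docstring and assertion (device strings and node counts are illustrative, and a CUDA-enabled build is assumed): every input graph, and any always_preserve tensor, is expected to sit on one device.

import dgl
import torch as th

g_gpu = dgl.heterograph({('user', 'follow', 'user'): ([0, 1], [1, 2])},
                        {'user': 10}).to('cuda')

# always_preserve lives on the same device as the graph; node 7 is kept in the
# compacted graph even though it has no incident edges.
new_g = dgl.compact_graphs(g_gpu, always_preserve=th.tensor([7], device='cuda'))

# Mixing devices would trip the assertion added above, e.g.:
# dgl.compact_graphs([g_gpu, g_gpu.to('cpu')])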
/*!
* Copyright (c) 2019 by Contributors
* Copyright 2019-2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/compact.cc
* \brief Compact graph implementation
*/
#include "compact.h"
#include <dgl/base_heterograph.h>
#include <dgl/transform.h>
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/container.h>
#include <vector>
#include <utility>
#include "../../c_api_common.h"
......@@ -27,7 +44,7 @@ namespace {
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(
CompactGraphsCPU(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
// TODO(BarclayII): check whether the node space and metagraph of each graph are the same.
......@@ -121,17 +138,20 @@ CompactGraphs(
}; // namespace
template<>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(
CompactGraphs<kDLCPU, int32_t>(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result;
// TODO(BarclayII): check for all IdArrays
CHECK(graphs[0]->DataType() == always_preserve[0]->dtype) << "data type mismatch.";
ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
result = CompactGraphs<IdType>(graphs, always_preserve);
});
return result;
return CompactGraphsCPU<int32_t>(graphs, always_preserve);
}
template<>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDLCPU, int64_t>(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
return CompactGraphsCPU<int64_t>(graphs, always_preserve);
}
DGL_REGISTER_GLOBAL("transform._CAPI_DGLCompactGraphs")
......@@ -146,7 +166,17 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLCompactGraphs")
for (Value array : always_preserve_refs)
always_preserve.push_back(array->data);
const auto &result_pair = CompactGraphs(graphs, always_preserve);
// TODO(BarclayII): check for all IdArrays
CHECK(graphs[0]->DataType() == always_preserve[0]->dtype) << "data type mismatch.";
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result_pair;
ATEN_XPU_SWITCH_CUDA(graphs[0]->Context().device_type, XPU, "CompactGraphs", {
ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
result_pair = CompactGraphs<XPU, IdType>(
graphs, always_preserve);
});
});
List<HeteroGraphRef> compacted_graph_refs;
List<Value> induced_nodes;
......
/*!
* Copyright 2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/compact.h
* \brief Functions to find and eliminate the common isolated nodes across
* all given graphs with the same set of nodes.
*/
#ifndef DGL_GRAPH_TRANSFORM_COMPACT_H_
#define DGL_GRAPH_TRANSFORM_COMPACT_H_
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include <utility>
namespace dgl {
namespace transform {
/**
* @brief Given a list of graphs with the same set of nodes, find and eliminate
* the common isolated nodes across all graphs.
*
* @tparam XPU The type of device to operate on.
* @tparam IdType The type to use as an index.
* @param graphs The list of graphs to be compacted.
* @param always_preserve The vector of nodes to be preserved.
*
* @return The vector of compacted graphs and the vector of induced nodes.
*/
template<DLDeviceType XPU, typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve);
} // namespace transform
} // namespace dgl
#endif // DGL_GRAPH_TRANSFORM_COMPACT_H_
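The header above only declares the templated entry point. As a rough mental model (plain Python with made-up node IDs, not the DGL implementation), the compaction it describes boils down to the following: gather the always-preserved nodes and every edge endpoint, deduplicate them, and rewrite the edges with the resulting dense IDs.

def compact(edge_lists, always_preserve=()):
    # Deduplicate while keeping first-appearance order.
    order, seen = [], set()
    candidates = list(always_preserve) + [n for edges in edge_lists
                                          for e in edges for n in e]
    for n in candidates:
        if n not in seen:
            seen.add(n)
            order.append(n)
    remap = {old: new for new, old in enumerate(order)}
    new_edge_lists = [[(remap[u], remap[v]) for u, v in edges]
                      for edges in edge_lists]
    return new_edge_lists, order

new_edges, induced_nodes = compact([[(1, 3), (3, 5)], [(2, 4)]],
                                   always_preserve=[9])
# induced_nodes == [9, 1, 3, 5, 2, 4]; the returned edges only use IDs 0..5.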
/*!
* Copyright 2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/cuda/cuda_compact_graph.cu
* \brief Functions to find and eliminate the common isolated nodes across
* all given graphs with the same set of nodes.
*/
#include <dgl/runtime/device_api.h>
#include <dgl/immutable_graph.h>
#include <cuda_runtime.h>
#include <utility>
#include <algorithm>
#include <memory>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../heterograph.h"
#include "../compact.h"
#include "cuda_map_edges.cuh"
using namespace dgl::aten;
using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda;
namespace dgl {
namespace transform {
namespace {
/**
* \brief This function builds node maps for each node type, preserving the
* order of the input nodes. Here it is assumed the nodes are not unique,
* and thus a unique list is generated.
*
* \param input_nodes The set of input nodes.
* \param node_maps The node maps to be constructed.
* \param count_unique_device Output buffer for the number of unique nodes per type (on the GPU).
* \param unique_nodes_device Output arrays holding the unique nodes per type (on the GPU).
* \param stream The stream to operate on.
*/
template<typename IdType>
void BuildNodeMaps(
const std::vector<IdArray>& input_nodes,
DeviceNodeMap<IdType> * const node_maps,
int64_t * const count_unique_device,
std::vector<IdArray>* const unique_nodes_device,
cudaStream_t stream) {
const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size());
CUDA_CALL(cudaMemsetAsync(
count_unique_device,
0,
num_ntypes*sizeof(*count_unique_device),
stream));
// possibly duplicated nodes
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
const IdArray& nodes = input_nodes[ntype];
if (nodes->shape[0] > 0) {
CHECK_EQ(nodes->ctx.device_type, kDLGPU);
node_maps->LhsHashTable(ntype).FillWithDuplicates(
nodes.Ptr<IdType>(),
nodes->shape[0],
(*unique_nodes_device)[ntype].Ptr<IdType>(),
count_unique_device+ntype,
stream);
}
}
}
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphsGPU(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
cudaStream_t stream = 0;
const auto& ctx = graphs[0]->Context();
auto device = runtime::DeviceAPI::Get(ctx);
CHECK_EQ(ctx.device_type, kDLGPU);
// Step 1: Collect the nodes that have connections for each type.
const int64_t num_ntypes = graphs[0]->NumVertexTypes();
std::vector<std::vector<EdgeArray>> all_edges(graphs.size()); // all_edges[i][etype]
// count the number of nodes per type
std::vector<int64_t> max_vertex_cnt(num_ntypes, 0);
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const int64_t num_etypes = curr_graph->NumEdgeTypes();
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const int64_t n_edges = curr_graph->NumEdges(etype);
max_vertex_cnt[srctype] += n_edges;
max_vertex_cnt[dsttype] += n_edges;
}
}
for (size_t i = 0; i < always_preserve.size(); ++i) {
max_vertex_cnt[i] += always_preserve[i]->shape[0];
}
// gather all nodes
std::vector<IdArray> all_nodes(num_ntypes);
std::vector<int64_t> node_offsets(num_ntypes, 0);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
all_nodes[ntype] = NewIdArray(max_vertex_cnt[ntype], ctx,
sizeof(IdType)*8);
// copy the nodes in always_preserve
if (ntype < always_preserve.size() && always_preserve[ntype]->shape[0] > 0) {
device->CopyDataFromTo(
always_preserve[ntype].Ptr<IdType>(), 0,
all_nodes[ntype].Ptr<IdType>(),
node_offsets[ntype],
sizeof(IdType)*always_preserve[ntype]->shape[0],
always_preserve[ntype]->ctx,
all_nodes[ntype]->ctx,
always_preserve[ntype]->dtype,
stream);
node_offsets[ntype] += sizeof(IdType)*always_preserve[ntype]->shape[0];
}
}
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const int64_t num_etypes = curr_graph->NumEdgeTypes();
all_edges[i].reserve(num_etypes);
for (int64_t etype = 0; etype < num_etypes; ++etype) {
dgl_type_t srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const EdgeArray edges = curr_graph->Edges(etype, "eid");
if (edges.src.defined()) {
device->CopyDataFromTo(
edges.src.Ptr<IdType>(), 0,
all_nodes[srctype].Ptr<IdType>(),
node_offsets[srctype],
sizeof(IdType)*edges.src->shape[0],
edges.src->ctx,
all_nodes[srctype]->ctx,
edges.src->dtype,
stream);
node_offsets[srctype] += sizeof(IdType)*edges.src->shape[0];
}
if (edges.dst.defined()) {
device->CopyDataFromTo(
edges.dst.Ptr<IdType>(), 0,
all_nodes[dsttype].Ptr<IdType>(),
node_offsets[dsttype],
sizeof(IdType)*edges.dst->shape[0],
edges.dst->ctx,
all_nodes[dsttype]->ctx,
edges.dst->dtype,
stream);
node_offsets[dsttype] += sizeof(IdType)*edges.dst->shape[0];
}
all_edges[i].push_back(edges);
}
}
// Step 2: Relabel the nodes for each type to a smaller ID space
// using BuildNodeMaps
// allocate space for map creation
// the hashmap on GPU
DeviceNodeMap<IdType> node_maps(max_vertex_cnt, 0, ctx, stream);
// number of unique nodes per type on CPU
std::vector<int64_t> num_induced_nodes(num_ntypes);
// number of unique nodes per type on GPU
int64_t * count_unique_device = static_cast<int64_t*>(
device->AllocWorkspace(ctx, sizeof(int64_t)*num_ntypes));
// the set of unique nodes per type
std::vector<IdArray> induced_nodes(num_ntypes);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
induced_nodes[ntype] = NewIdArray(max_vertex_cnt[ntype], ctx,
sizeof(IdType)*8);
}
BuildNodeMaps(
all_nodes,
&node_maps,
count_unique_device,
&induced_nodes,
stream);
device->CopyDataFromTo(
count_unique_device, 0,
num_induced_nodes.data(), 0,
sizeof(*num_induced_nodes.data())*num_ntypes,
ctx,
DGLContext{kDLCPU, 0},
DGLType{kDLInt, 64, 1},
stream);
device->StreamSync(ctx, stream);
// wait for the node counts to finish transferring
device->FreeWorkspace(ctx, count_unique_device);
// resize induced nodes
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
induced_nodes[ntype]->shape[0] = num_induced_nodes[ntype];
}
// Step 3: Remap the edges of each graph using MapEdges
std::vector<HeteroGraphPtr> new_graphs;
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const auto meta_graph = curr_graph->meta_graph();
const int64_t num_etypes = curr_graph->NumEdgeTypes();
std::vector<HeteroGraphPtr> rel_graphs;
rel_graphs.reserve(num_etypes);
std::vector<IdArray> new_src;
std::vector<IdArray> new_dst;
std::tie(new_src, new_dst) = MapEdges(
curr_graph, all_edges[i], node_maps, stream);
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
rel_graphs.push_back(UnitGraph::CreateFromCOO(
srctype == dsttype ? 1 : 2,
induced_nodes[srctype]->shape[0],
induced_nodes[dsttype]->shape[0],
new_src[etype],
new_dst[etype]));
}
new_graphs.push_back(CreateHeteroGraph(meta_graph, rel_graphs, num_induced_nodes));
}
return std::make_pair(new_graphs, induced_nodes);
}
} // namespace
template<>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDLGPU, int32_t>(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
return CompactGraphsGPU<int32_t>(graphs, always_preserve);
}
template<>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDLGPU, int64_t>(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
return CompactGraphsGPU<int64_t>(graphs, always_preserve);
}
} // namespace transform
} // namespace dgl
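As an aside, here is a rough Python analogue (an illustration only, not the CUDA OrderedHashTable) of the property BuildNodeMaps relies on: FillWithDuplicates deduplicates a possibly repeated node list while preserving first-appearance order, yielding both the induced node list and the old-to-new mapping that the edge-remapping step applies afterwards.

def fill_with_duplicates(nodes):
    unique, mapping = [], {}
    for n in nodes:
        if n not in mapping:
            mapping[n] = len(unique)   # new local ID, in first-appearance order
            unique.append(n)
    return unique, mapping

unique_nodes, node_map = fill_with_duplicates([7, 2, 7, 5, 2])
# unique_nodes == [7, 2, 5]; node_map == {7: 0, 2: 1, 5: 2}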
/*!
* Copyright 2020-2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/cuda/cuda_map_edges.cuh
* \brief Device level functions for mapping edges.
*/
#ifndef DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_
#define DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_
#include <dgl/runtime/c_runtime_api.h>
#include <cuda_runtime.h>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../../runtime/cuda/cuda_hashtable.cuh"
using namespace dgl::aten;
using namespace dgl::runtime::cuda;
namespace dgl {
namespace transform {
namespace cuda {
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__device__ void map_vertex_ids(
const IdType * const global,
IdType * const new_global,
const IdType num_vertices,
const DeviceOrderedHashTable<IdType>& table) {
assert(BLOCK_SIZE == blockDim.x);
using Mapping = typename OrderedHashTable<IdType>::Mapping;
const IdType tile_start = TILE_SIZE*blockIdx.x;
const IdType tile_end = min(TILE_SIZE*(blockIdx.x+1), num_vertices);
for (IdType idx = threadIdx.x+tile_start; idx < tile_end; idx+=BLOCK_SIZE) {
const Mapping& mapping = *table.Search(global[idx]);
new_global[idx] = mapping.local;
}
}
/**
* \brief Generate mapped edge endpoint ids.
*
* \tparam IdType The type of id.
* \tparam BLOCK_SIZE The size of each thread block.
* \tparam TILE_SIZE The number of edges to process per thread block.
* \param global_srcs_device The source ids to map.
* \param new_global_srcs_device The mapped source ids (output).
* \param global_dsts_device The destination ids to map.
* \param new_global_dsts_device The mapped destination ids (output).
* \param num_edges The number of edges to map.
* \param src_mapping The mapping of source ids.
* \param dst_mapping The mapping of destination ids.
*/
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__global__ void map_edge_ids(
const IdType * const global_srcs_device,
IdType * const new_global_srcs_device,
const IdType * const global_dsts_device,
IdType * const new_global_dsts_device,
const IdType num_edges,
DeviceOrderedHashTable<IdType> src_mapping,
DeviceOrderedHashTable<IdType> dst_mapping) {
assert(BLOCK_SIZE == blockDim.x);
assert(2 == gridDim.y);
if (blockIdx.y == 0) {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_srcs_device,
new_global_srcs_device,
num_edges,
src_mapping);
} else {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_dsts_device,
new_global_dsts_device,
num_edges,
dst_mapping);
}
}
/**
* \brief Device level node maps for each node type.
*
* \param num_nodes Number of nodes per type.
* \param offset When set to 0, LhsHashTable and RhsHashTable refer to the same
* tables; set it to num_nodes.size()/2 to use separate LhsHashTable and RhsHashTable.
* \param ctx The DGL context.
* \param stream The stream to operate on.
*/
template<typename IdType>
class DeviceNodeMap {
public:
using Mapping = typename OrderedHashTable<IdType>::Mapping;
DeviceNodeMap(
const std::vector<int64_t>& num_nodes,
const int64_t offset,
DGLContext ctx,
cudaStream_t stream) :
num_types_(num_nodes.size()),
rhs_offset_(offset),
workspaces_(),
hash_tables_(),
ctx_(ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
hash_tables_.reserve(num_types_);
workspaces_.reserve(num_types_);
for (int64_t i = 0; i < num_types_; ++i) {
hash_tables_.emplace_back(
new OrderedHashTable<IdType>(
num_nodes[i],
ctx_,
stream));
}
}
OrderedHashTable<IdType>& LhsHashTable(
const size_t index) {
return HashData(index);
}
OrderedHashTable<IdType>& RhsHashTable(
const size_t index) {
return HashData(index+rhs_offset_);
}
const OrderedHashTable<IdType>& LhsHashTable(
const size_t index) const {
return HashData(index);
}
const OrderedHashTable<IdType>& RhsHashTable(
const size_t index) const {
return HashData(index+rhs_offset_);
}
IdType LhsHashSize(
const size_t index) const {
return HashSize(index);
}
IdType RhsHashSize(
const size_t index) const {
return HashSize(rhs_offset_+index);
}
size_t Size() const {
return hash_tables_.size();
}
private:
int64_t num_types_;
size_t rhs_offset_;
std::vector<void*> workspaces_;
std::vector<std::unique_ptr<OrderedHashTable<IdType>>> hash_tables_;
DGLContext ctx_;
inline OrderedHashTable<IdType>& HashData(
const size_t index) {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline const OrderedHashTable<IdType>& HashData(
const size_t index) const {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline IdType HashSize(
const size_t index) const {
return HashData(index).size();
}
};
template<typename IdType>
inline size_t RoundUpDiv(
const IdType num,
const size_t divisor) {
return static_cast<IdType>(num/divisor) + (num % divisor == 0 ? 0 : 1);
}
template<typename IdType>
inline IdType RoundUp(
const IdType num,
const size_t unit) {
return RoundUpDiv(num, unit)*unit;
}
template<typename IdType>
std::tuple<std::vector<IdArray>, std::vector<IdArray>>
MapEdges(
HeteroGraphPtr graph,
const std::vector<EdgeArray>& edge_sets,
const DeviceNodeMap<IdType>& node_map,
cudaStream_t stream) {
constexpr const int BLOCK_SIZE = 128;
constexpr const size_t TILE_SIZE = 1024;
const auto& ctx = graph->Context();
std::vector<IdArray> new_lhs;
new_lhs.reserve(edge_sets.size());
std::vector<IdArray> new_rhs;
new_rhs.reserve(edge_sets.size());
// The next performance optimization here is to perform the mapping of all
// edge types in a single kernel launch.
const int64_t num_edge_sets = static_cast<int64_t>(edge_sets.size());
for (int64_t etype = 0; etype < num_edge_sets; ++etype) {
const EdgeArray& edges = edge_sets[etype];
if (edges.id.defined() && edges.src->shape[0] > 0) {
const int64_t num_edges = edges.src->shape[0];
new_lhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
new_rhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
const auto src_dst_types = graph->GetEndpointTypes(etype);
const int src_type = src_dst_types.first;
const int dst_type = src_dst_types.second;
const dim3 grid(RoundUpDiv(num_edges, TILE_SIZE), 2);
const dim3 block(BLOCK_SIZE);
// map the srcs
map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE><<<
grid,
block,
0,
stream>>>(
edges.src.Ptr<IdType>(),
new_lhs.back().Ptr<IdType>(),
edges.dst.Ptr<IdType>(),
new_rhs.back().Ptr<IdType>(),
num_edges,
node_map.LhsHashTable(src_type).DeviceHandle(),
node_map.RhsHashTable(dst_type).DeviceHandle());
CUDA_CALL(cudaGetLastError());
} else {
new_lhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
new_rhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
}
}
return std::tuple<std::vector<IdArray>, std::vector<IdArray>>(
std::move(new_lhs), std::move(new_rhs));
}
} // namespace cuda
} // namespace transform
} // namespace dgl
#endif
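Conceptually (plain Python with made-up IDs, not the kernel above), MapEdges and map_edge_ids simply look each source ID up in the left-hand node map and each destination ID up in the right-hand node map, yielding edges expressed in the compacted ID space.

lhs_map = {7: 0, 2: 1, 5: 2}          # built by the ordered hash table
rhs_map = lhs_map                     # identical tables when the offset is 0
src, dst = [7, 2, 7], [5, 5, 2]
new_src = [lhs_map[s] for s in src]   # [0, 1, 0]
new_dst = [rhs_map[d] for d in dst]   # [2, 2, 1]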
......@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/cuda_to_block.cu
* \file graph/transform/cuda/cuda_to_block.cu
* \brief Functions to convert a set of edges into a graph block with local
* ids.
*/
......@@ -27,182 +27,19 @@
#include <memory>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../../runtime/cuda/cuda_hashtable.cuh"
#include "../../heterograph.h"
#include "../to_bipartite.h"
#include "cuda_map_edges.cuh"
using namespace dgl::aten;
using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda;
namespace dgl {
namespace transform {
namespace {
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__device__ void map_vertex_ids(
const IdType * const global,
IdType * const new_global,
const IdType num_vertices,
const DeviceOrderedHashTable<IdType>& table) {
assert(BLOCK_SIZE == blockDim.x);
using Mapping = typename OrderedHashTable<IdType>::Mapping;
const IdType tile_start = TILE_SIZE*blockIdx.x;
const IdType tile_end = min(TILE_SIZE*(blockIdx.x+1), num_vertices);
for (IdType idx = threadIdx.x+tile_start; idx < tile_end; idx+=BLOCK_SIZE) {
const Mapping& mapping = *table.Search(global[idx]);
new_global[idx] = mapping.local;
}
}
/**
* \brief Generate mapped edge endpoint ids.
*
* \tparam IdType The type of id.
* \tparam BLOCK_SIZE The size of each thread block.
* \tparam TILE_SIZE The number of edges to process per thread block.
* \param global_srcs_device The source ids to map.
* \param new_global_srcs_device The mapped source ids (output).
* \param global_dsts_device The destination ids to map.
* \param new_global_dsts_device The mapped destination ids (output).
* \param num_edges The number of edges to map.
* \param src_mapping The mapping of source ids.
* \param dst_mapping The mapping of destination ids.
*/
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__global__ void map_edge_ids(
const IdType * const global_srcs_device,
IdType * const new_global_srcs_device,
const IdType * const global_dsts_device,
IdType * const new_global_dsts_device,
const IdType num_edges,
DeviceOrderedHashTable<IdType> src_mapping,
DeviceOrderedHashTable<IdType> dst_mapping) {
assert(BLOCK_SIZE == blockDim.x);
assert(2 == gridDim.y);
if (blockIdx.y == 0) {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_srcs_device,
new_global_srcs_device,
num_edges,
src_mapping);
} else {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_dsts_device,
new_global_dsts_device,
num_edges,
dst_mapping);
}
}
template<typename IdType>
inline size_t RoundUpDiv(
const IdType num,
const size_t divisor) {
return static_cast<IdType>(num/divisor) + (num % divisor == 0 ? 0 : 1);
}
template<typename IdType>
inline IdType RoundUp(
const IdType num,
const size_t unit) {
return RoundUpDiv(num, unit)*unit;
}
template<typename IdType>
class DeviceNodeMap {
public:
using Mapping = typename OrderedHashTable<IdType>::Mapping;
DeviceNodeMap(
const std::vector<int64_t>& num_nodes,
DGLContext ctx,
cudaStream_t stream) :
num_types_(num_nodes.size()),
rhs_offset_(num_types_/2),
workspaces_(),
hash_tables_(),
ctx_(ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
hash_tables_.reserve(num_types_);
workspaces_.reserve(num_types_);
for (int64_t i = 0; i < num_types_; ++i) {
hash_tables_.emplace_back(
new OrderedHashTable<IdType>(
num_nodes[i],
ctx_,
stream));
}
}
OrderedHashTable<IdType>& LhsHashTable(
const size_t index) {
return HashData(index);
}
OrderedHashTable<IdType>& RhsHashTable(
const size_t index) {
return HashData(index+rhs_offset_);
}
const OrderedHashTable<IdType>& LhsHashTable(
const size_t index) const {
return HashData(index);
}
const OrderedHashTable<IdType>& RhsHashTable(
const size_t index) const {
return HashData(index+rhs_offset_);
}
IdType LhsHashSize(
const size_t index) const {
return HashSize(index);
}
IdType RhsHashSize(
const size_t index) const {
return HashSize(rhs_offset_+index);
}
size_t Size() const {
return hash_tables_.size();
}
private:
int64_t num_types_;
size_t rhs_offset_;
std::vector<void*> workspaces_;
std::vector<std::unique_ptr<OrderedHashTable<IdType>>> hash_tables_;
DGLContext ctx_;
inline OrderedHashTable<IdType>& HashData(
const size_t index) {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline const OrderedHashTable<IdType>& HashData(
const size_t index) const {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline IdType HashSize(
const size_t index) const {
return HashData(index).size();
}
};
template<typename IdType>
class DeviceNodeMapMaker {
public:
......@@ -315,67 +152,6 @@ class DeviceNodeMapMaker {
IdType max_num_nodes_;
};
template<typename IdType>
std::tuple<std::vector<IdArray>, std::vector<IdArray>>
MapEdges(
HeteroGraphPtr graph,
const std::vector<EdgeArray>& edge_sets,
const DeviceNodeMap<IdType>& node_map,
cudaStream_t stream) {
constexpr const int BLOCK_SIZE = 128;
constexpr const size_t TILE_SIZE = 1024;
const auto& ctx = graph->Context();
std::vector<IdArray> new_lhs;
new_lhs.reserve(edge_sets.size());
std::vector<IdArray> new_rhs;
new_rhs.reserve(edge_sets.size());
// The next performance optimization here is to perform the mapping of all
// edge types in a single kernel launch.
const int64_t num_edge_sets = static_cast<int64_t>(edge_sets.size());
for (int64_t etype = 0; etype < num_edge_sets; ++etype) {
const EdgeArray& edges = edge_sets[etype];
if (edges.id.defined() && edges.src->shape[0] > 0) {
const int64_t num_edges = edges.src->shape[0];
new_lhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
new_rhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
const auto src_dst_types = graph->GetEndpointTypes(etype);
const int src_type = src_dst_types.first;
const int dst_type = src_dst_types.second;
const dim3 grid(RoundUpDiv(num_edges, TILE_SIZE), 2);
const dim3 block(BLOCK_SIZE);
// map the srcs
map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE><<<
grid,
block,
0,
stream>>>(
edges.src.Ptr<IdType>(),
new_lhs.back().Ptr<IdType>(),
edges.dst.Ptr<IdType>(),
new_rhs.back().Ptr<IdType>(),
num_edges,
node_map.LhsHashTable(src_type).DeviceHandle(),
node_map.RhsHashTable(dst_type).DeviceHandle());
CUDA_CALL(cudaGetLastError());
} else {
new_lhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
new_rhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
}
}
return std::tuple<std::vector<IdArray>, std::vector<IdArray>>(
std::move(new_lhs), std::move(new_rhs));
}
// Since partial specialization is not allowed for functions, use this as an
// intermediate for ToBlock where XPU = kDLGPU.
......@@ -487,7 +263,7 @@ ToBlockGPU(
// allocate space for map creation process
DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
DeviceNodeMap<IdType> node_maps(maxNodesPerType, ctx, stream);
DeviceNodeMap<IdType> node_maps(maxNodesPerType, num_ntypes, ctx, stream);
if (generate_lhs_nodes) {
lhs_nodes.reserve(num_ntypes);
......
/*!
* Copyright (c) 2020 by Contributors
* \file graph/transform/cuda/cuda_to_block.h
* \brief Functions to convert a set of edges into a graph block with local
* ids.
*/
#ifndef DGL_GRAPH_TRANSFORM_CUDA_CUDA_TO_BLOCK_H_
#define DGL_GRAPH_TRANSFORM_CUDA_CUDA_TO_BLOCK_H_
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include <tuple>
namespace dgl {
namespace transform {
namespace cuda {
/**
* @brief Generate a subgraph with locally numbered vertices, from the given
* edge set.
*
* @param graph The set of edges to construct the subgraph from.
* @param rhs_nodes The unique set of destination vertices.
* @param include_rhs_in_lhs Whether or not to include the `rhs_nodes` in the
* set of source vertices for purposes of local numbering.
*
* @return The subgraph, the unique set of source nodes, and the mapping of
* subgraph edges to global edges.
*/
std::tuple<HeteroGraphPtr, std::vector<IdArray>, std::vector<IdArray>>
CudaToBlock(
HeteroGraphPtr graph,
const std::vector<IdArray>& rhs_nodes,
const bool include_rhs_in_lhs);
} // namespace cuda
} // namespace transform
} // namespace dgl
#endif // DGL_GRAPH_TRANSFORM_CUDA_CUDA_TO_BLOCK_H_
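For context, the Python entry point whose GPU path ends up in CudaToBlock is dgl.to_block; a minimal sketch follows (toy graph and seed IDs, assuming a CUDA-enabled build).

import dgl
import torch as th

frontier = dgl.graph(([0, 1, 2], [3, 3, 4]), num_nodes=10).to('cuda')
seeds = th.tensor([3, 4], device='cuda')

block = dgl.to_block(frontier, dst_nodes=seeds, include_dst_in_src=True)
# block.srcdata[dgl.NID] / block.dstdata[dgl.NID] hold the induced global IDs.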
......@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/cuda_to_block.cu
* \file graph/transform/to_bipartite.h
* \brief Functions to convert a set of edges into a graph block with local
* ids.
*/
......
......@@ -662,24 +662,23 @@ def test_reorder_nodes():
old_neighs2 = g.predecessors(old_nid)
assert np.all(np.sort(old_neighs1) == np.sort(F.asnumpy(old_neighs2)))
@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU compaction not implemented")
@parametrize_dtype
def test_compact(idtype):
g1 = dgl.heterograph({
('user', 'follow', 'user'): ([1, 3], [3, 5]),
('user', 'plays', 'game'): ([2, 3, 2], [4, 4, 5]),
('game', 'wished-by', 'user'): ([6, 5], [7, 7])},
{'user': 20, 'game': 10}, idtype=idtype)
{'user': 20, 'game': 10}, idtype=idtype, device=F.ctx())
g2 = dgl.heterograph({
('game', 'clicked-by', 'user'): ([3], [1]),
('user', 'likes', 'user'): ([1, 8], [8, 9])},
{'user': 20, 'game': 10}, idtype=idtype)
{'user': 20, 'game': 10}, idtype=idtype, device=F.ctx())
g3 = dgl.heterograph({('user', '_E', 'user'): ((0, 1), (1, 2))},
{'user': 10}, idtype=idtype)
{'user': 10}, idtype=idtype, device=F.ctx())
g4 = dgl.heterograph({('user', '_E', 'user'): ((1, 3), (3, 5))},
{'user': 10}, idtype=idtype)
{'user': 10}, idtype=idtype, device=F.ctx())
def _check(g, new_g, induced_nodes):
assert g.ntypes == new_g.ntypes
......