Unverified Commit a8c81018 authored by Xin Yao, committed by GitHub

[Sampling] Implement dgl.compact_graphs() for the GPU (#3423)

* gpu compact graph template

* cuda compact graph draft

* fix typo

* compact graphs

* pass unit test but fail in training

* example using EdgeDataLoader on the GPU

* refactor cuda_compact_graph and cuda_to_block

* update training scripts

* fix linting

* fix linting

* fix exclude_edges for the GPU

* add --data-cpu & fix copyright
parent 308e52a3
...@@ -67,6 +67,9 @@ def run(proc_id, n_gpus, args, devices, data):
train_mask, val_mask, test_mask, n_classes, g = data
nfeat = g.ndata.pop('feat')
labels = g.ndata.pop('label')
if not args.data_cpu:
nfeat = nfeat.to(device)
labels = labels.to(device)
in_feats = nfeat.shape[1]
train_nid = th.LongTensor(np.nonzero(train_mask)).squeeze()
...@@ -77,6 +80,11 @@ def run(proc_id, n_gpus, args, devices, data):
n_edges = g.num_edges()
train_seeds = th.arange(n_edges)
if args.sample_gpu:
assert n_gpus > 0, "Must have GPUs to enable GPU sampling"
train_seeds = train_seeds.to(device)
g = g.to(device)
# Create sampler
sampler = dgl.dataloading.MultiLayerNeighborSampler(
[int(fanout) for fanout in args.fan_out.split(',')])
...@@ -121,11 +129,11 @@ def run(proc_id, n_gpus, args, devices, data):
tic_step = time.time()
for step, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(dataloader):
batch_inputs = nfeat[input_nodes].to(device)
d_step = time.time()
pos_graph = pos_graph.to(device)
neg_graph = neg_graph.to(device)
blocks = [block.int().to(device) for block in blocks]
d_step = time.time()
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, pos_graph, neg_graph)
...@@ -213,6 +221,13 @@ if __name__ == '__main__':
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=0,
help="Number of sampling processes. Use 0 for no extra process.")
argparser.add_argument('--sample-gpu', action='store_true',
help="Perform the sampling process on the GPU. Must have 0 workers.")
argparser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
args = argparser.parse_args()
devices = list(map(int, args.gpu.split(',')))
...
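For context, a minimal sketch of the GPU-sampling path that the `--sample-gpu` flag above enables, based on the DGL 0.7-era dataloading API; the graph `g`, `device`, fan-outs, and batch size here are placeholders, not values from this commit. Because the graph structure and the seed edges live on the GPU, neighbor sampling runs on the GPU, and `num_workers` must be 0, as the help text notes.

```python
import dgl
import torch as th

# Assumed setup: `g` is a DGLGraph and `device` is a CUDA device.
g = g.to(device)                                   # graph structure on the GPU
train_seeds = th.arange(g.num_edges(), device=device)

sampler = dgl.dataloading.MultiLayerNeighborSampler([10, 25])
dataloader = dgl.dataloading.EdgeDataLoader(
    g, train_seeds, sampler,
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(1),
    batch_size=1024, shuffle=True, drop_last=False,
    num_workers=0)                                 # GPU sampling requires 0 worker processes

for input_nodes, pos_graph, neg_graph, blocks in dataloader:
    # When sampling on the GPU, pos_graph, neg_graph and the blocks are
    # produced on `device`, so the .to(device) calls above become no-ops.
    break
```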
...@@ -368,7 +368,7 @@ class BlockSampler(object):
else:
seed_nodes_in = seed_nodes_in.to(graph_device)
-if self.exclude_edges_in_frontier:
+if self.exclude_edges_in_frontier(g):
frontier = self.sample_frontier(
block_id, g, seed_nodes_in, exclude_eids=exclude_eids)
else:
...
...@@ -1913,7 +1913,7 @@ def compact_graphs(graphs, always_preserve=None, copy_ndata=True, copy_edata=Tru
graphs : DGLGraph or list[DGLGraph]
The graph, or list of graphs.
-All graphs must be on CPU.
+All graphs must be on the same device.
All graphs must have the same set of nodes.
always_preserve : Tensor or dict[str, Tensor], optional
...@@ -2013,7 +2013,6 @@ def compact_graphs(graphs, always_preserve=None, copy_ndata=True, copy_edata=Tru
return []
if graphs[0].is_block:
raise DGLError('Compacting a block graph is not allowed.')
-assert all(g.device == F.cpu() for g in graphs), 'all the graphs must be on CPU'
# Ensure the node types are ordered the same.
# TODO(BarclayII): we ideally need to remove this constraint.
...@@ -2026,8 +2025,8 @@ def compact_graphs(graphs, always_preserve=None, copy_ndata=True, copy_edata=Tru
ntypes, g.ntypes)
assert idtype == g.idtype, "Expect graph data type to be {}, but got {}".format(
idtype, g.idtype)
-assert device == g.device, "Expect graph device to be {}, but got {}".format(
-    device, g.device)
+assert device == g.device, "All graphs must be on the same device. " \
+    "Expect graph device to be {}, but got {}".format(device, g.device)
# Process the dictionary or tensor of "always preserve" nodes
if always_preserve is None:
...
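To illustrate the relaxed device requirement documented above, a small sketch (assuming a CUDA-enabled DGL build; the graph sizes and IDs are arbitrary). The compacted graph keeps the mapping back to the original node IDs in `ndata[dgl.NID]`.

```python
import dgl
import torch as th

g = dgl.heterograph(
    {('user', 'plays', 'game'): (th.tensor([1, 3]), th.tensor([2, 2]))},
    num_nodes_dict={'user': 10, 'game': 5}).to('cuda')

# Previously compact_graphs required CPU inputs; now it runs on whichever
# device the input graphs live on.
new_g = dgl.compact_graphs(g)
print(new_g.device)                    # cuda:0
print(new_g.ndata[dgl.NID]['user'])    # original user IDs kept, e.g. tensor([1, 3])
print(new_g.ndata[dgl.NID]['game'])    # original game IDs kept, e.g. tensor([2])
```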
/*!
- * Copyright (c) 2019 by Contributors
+ * Copyright 2019-2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/compact.cc
* \brief Compact graph implementation
*/
#include "compact.h"
#include <dgl/base_heterograph.h>
#include <dgl/transform.h>
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/container.h>
#include <vector>
#include <utility>
#include "../../c_api_common.h"
...@@ -27,7 +44,7 @@ namespace {
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
-CompactGraphs(
+CompactGraphsCPU(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
// TODO(BarclayII): check whether the node space and metagraph of each graph is the same.
...@@ -121,17 +138,20 @@ CompactGraphs(
};  // namespace
+template<>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
-CompactGraphs(
+CompactGraphs<kDLCPU, int32_t>(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
-  std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result;
-  // TODO(BarclayII): check for all IdArrays
-  CHECK(graphs[0]->DataType() == always_preserve[0]->dtype) << "data type mismatch.";
-  ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
-    result = CompactGraphs<IdType>(graphs, always_preserve);
-  });
-  return result;
+  return CompactGraphsCPU<int32_t>(graphs, always_preserve);
+}
+
+template<>
+std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
+CompactGraphs<kDLCPU, int64_t>(
+    const std::vector<HeteroGraphPtr> &graphs,
+    const std::vector<IdArray> &always_preserve) {
+  return CompactGraphsCPU<int64_t>(graphs, always_preserve);
}
DGL_REGISTER_GLOBAL("transform._CAPI_DGLCompactGraphs")
...@@ -146,7 +166,17 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLCompactGraphs")
for (Value array : always_preserve_refs)
always_preserve.push_back(array->data);
-const auto &result_pair = CompactGraphs(graphs, always_preserve);
+// TODO(BarclayII): check for all IdArrays
+CHECK(graphs[0]->DataType() == always_preserve[0]->dtype) << "data type mismatch.";
+std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result_pair;
+ATEN_XPU_SWITCH_CUDA(graphs[0]->Context().device_type, XPU, "CompactGraphs", {
+  ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
+    result_pair = CompactGraphs<XPU, IdType>(
+      graphs, always_preserve);
+  });
+});
List<HeteroGraphRef> compacted_graph_refs;
List<Value> induced_nodes;
...
/*!
* Copyright 2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/compact.h
* \brief Functions to find and eliminate the common isolated nodes across
* all given graphs with the same set of nodes.
*/
#ifndef DGL_GRAPH_TRANSFORM_COMPACT_H_
#define DGL_GRAPH_TRANSFORM_COMPACT_H_
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include <utility>
namespace dgl {
namespace transform {
/**
* @brief Given a list of graphs with the same set of nodes, find and eliminate
* the common isolated nodes across all graphs.
*
* @tparam XPU The type of device to operate on.
* @tparam IdType The type to use as an index.
* @param graphs The list of graphs to be compacted.
* @param always_preserve The vector of nodes to be preserved.
*
* @return The vector of compacted graphs and the vector of induced nodes.
*/
template<DLDeviceType XPU, typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve);
} // namespace transform
} // namespace dgl
#endif // DGL_GRAPH_TRANSFORM_COMPACT_H_
/*!
* Copyright 2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/cuda/cuda_compact_graph.cu
* \brief Functions to find and eliminate the common isolated nodes across
* all given graphs with the same set of nodes.
*/
#include <dgl/runtime/device_api.h>
#include <dgl/immutable_graph.h>
#include <cuda_runtime.h>
#include <utility>
#include <algorithm>
#include <memory>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../heterograph.h"
#include "../compact.h"
#include "cuda_map_edges.cuh"
using namespace dgl::aten;
using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda;
namespace dgl {
namespace transform {
namespace {
/**
* \brief This function builds node maps for each node type, preserving the
* order of the input nodes. Here it is assumed the nodes are not unique,
* and thus a unique list is generated.
*
* \param input_nodes The set of input nodes.
* \param node_maps The node maps to be constructed.
* \param count_unique_device The number of unique nodes (on the GPU).
* \param unique_nodes_device The unique nodes (on the GPU).
* \param stream The stream to operate on.
*/
template<typename IdType>
void BuildNodeMaps(
const std::vector<IdArray>& input_nodes,
DeviceNodeMap<IdType> * const node_maps,
int64_t * const count_unique_device,
std::vector<IdArray>* const unique_nodes_device,
cudaStream_t stream) {
const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size());
CUDA_CALL(cudaMemsetAsync(
count_unique_device,
0,
num_ntypes*sizeof(*count_unique_device),
stream));
// possibly duplicated nodes
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
const IdArray& nodes = input_nodes[ntype];
if (nodes->shape[0] > 0) {
CHECK_EQ(nodes->ctx.device_type, kDLGPU);
node_maps->LhsHashTable(ntype).FillWithDuplicates(
nodes.Ptr<IdType>(),
nodes->shape[0],
(*unique_nodes_device)[ntype].Ptr<IdType>(),
count_unique_device+ntype,
stream);
}
}
}
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphsGPU(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
cudaStream_t stream = 0;
const auto& ctx = graphs[0]->Context();
auto device = runtime::DeviceAPI::Get(ctx);
CHECK_EQ(ctx.device_type, kDLGPU);
// Step 1: Collect the nodes that have connections for each type.
const int64_t num_ntypes = graphs[0]->NumVertexTypes();
std::vector<std::vector<EdgeArray>> all_edges(graphs.size()); // all_edges[i][etype]
// count the number of nodes per type
std::vector<int64_t> max_vertex_cnt(num_ntypes, 0);
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const int64_t num_etypes = curr_graph->NumEdgeTypes();
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const int64_t n_edges = curr_graph->NumEdges(etype);
max_vertex_cnt[srctype] += n_edges;
max_vertex_cnt[dsttype] += n_edges;
}
}
for (size_t i = 0; i < always_preserve.size(); ++i) {
max_vertex_cnt[i] += always_preserve[i]->shape[0];
}
// gather all nodes
std::vector<IdArray> all_nodes(num_ntypes);
std::vector<int64_t> node_offsets(num_ntypes, 0);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
all_nodes[ntype] = NewIdArray(max_vertex_cnt[ntype], ctx,
sizeof(IdType)*8);
// copy the nodes in always_preserve
if (ntype < always_preserve.size() && always_preserve[ntype]->shape[0] > 0) {
device->CopyDataFromTo(
always_preserve[ntype].Ptr<IdType>(), 0,
all_nodes[ntype].Ptr<IdType>(),
node_offsets[ntype],
sizeof(IdType)*always_preserve[ntype]->shape[0],
always_preserve[ntype]->ctx,
all_nodes[ntype]->ctx,
always_preserve[ntype]->dtype,
stream);
node_offsets[ntype] += sizeof(IdType)*always_preserve[ntype]->shape[0];
}
}
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const int64_t num_etypes = curr_graph->NumEdgeTypes();
all_edges[i].reserve(num_etypes);
for (int64_t etype = 0; etype < num_etypes; ++etype) {
dgl_type_t srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const EdgeArray edges = curr_graph->Edges(etype, "eid");
if (edges.src.defined()) {
device->CopyDataFromTo(
edges.src.Ptr<IdType>(), 0,
all_nodes[srctype].Ptr<IdType>(),
node_offsets[srctype],
sizeof(IdType)*edges.src->shape[0],
edges.src->ctx,
all_nodes[srctype]->ctx,
edges.src->dtype,
stream);
node_offsets[srctype] += sizeof(IdType)*edges.src->shape[0];
}
if (edges.dst.defined()) {
device->CopyDataFromTo(
edges.dst.Ptr<IdType>(), 0,
all_nodes[dsttype].Ptr<IdType>(),
node_offsets[dsttype],
sizeof(IdType)*edges.dst->shape[0],
edges.dst->ctx,
all_nodes[dsttype]->ctx,
edges.dst->dtype,
stream);
node_offsets[dsttype] += sizeof(IdType)*edges.dst->shape[0];
}
all_edges[i].push_back(edges);
}
}
// Step 2: Relabel the nodes for each type to a smaller ID space
// using BuildNodeMaps
// allocate space for map creation
// the hashmap on GPU
DeviceNodeMap<IdType> node_maps(max_vertex_cnt, 0, ctx, stream);
// number of unique nodes per type on CPU
std::vector<int64_t> num_induced_nodes(num_ntypes);
// number of unique nodes per type on GPU
int64_t * count_unique_device = static_cast<int64_t*>(
device->AllocWorkspace(ctx, sizeof(int64_t)*num_ntypes));
// the set of unique nodes per type
std::vector<IdArray> induced_nodes(num_ntypes);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
induced_nodes[ntype] = NewIdArray(max_vertex_cnt[ntype], ctx,
sizeof(IdType)*8);
}
BuildNodeMaps(
all_nodes,
&node_maps,
count_unique_device,
&induced_nodes,
stream);
device->CopyDataFromTo(
count_unique_device, 0,
num_induced_nodes.data(), 0,
sizeof(*num_induced_nodes.data())*num_ntypes,
ctx,
DGLContext{kDLCPU, 0},
DGLType{kDLInt, 64, 1},
stream);
device->StreamSync(ctx, stream);
// wait for the node counts to finish transferring
device->FreeWorkspace(ctx, count_unique_device);
// resize induced nodes
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
induced_nodes[ntype]->shape[0] = num_induced_nodes[ntype];
}
// Step 3: Remap the edges of each graph using MapEdges
std::vector<HeteroGraphPtr> new_graphs;
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const auto meta_graph = curr_graph->meta_graph();
const int64_t num_etypes = curr_graph->NumEdgeTypes();
std::vector<HeteroGraphPtr> rel_graphs;
rel_graphs.reserve(num_etypes);
std::vector<IdArray> new_src;
std::vector<IdArray> new_dst;
std::tie(new_src, new_dst) = MapEdges(
curr_graph, all_edges[i], node_maps, stream);
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
rel_graphs.push_back(UnitGraph::CreateFromCOO(
srctype == dsttype ? 1 : 2,
induced_nodes[srctype]->shape[0],
induced_nodes[dsttype]->shape[0],
new_src[etype],
new_dst[etype]));
}
new_graphs.push_back(CreateHeteroGraph(meta_graph, rel_graphs, num_induced_nodes));
}
return std::make_pair(new_graphs, induced_nodes);
}
} // namespace
template<>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDLGPU, int32_t>(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
return CompactGraphsGPU<int32_t>(graphs, always_preserve);
}
template<>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDLGPU, int64_t>(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
return CompactGraphsGPU<int64_t>(graphs, always_preserve);
}
} // namespace transform
} // namespace dgl
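The CUDA implementation above works in three steps: collect every edge endpoint (plus the `always_preserve` nodes), build an order-preserving unique node map with the GPU hash table, and remap the edges into the compact ID space. As a rough reference for what those steps compute, here is a hedged NumPy-style sketch for a single homogeneous graph; it ignores node and edge types and is not how the kernels are actually implemented.

```python
import numpy as np

def compact_reference(src, dst, always_preserve=np.array([], dtype=np.int64)):
    # Step 1: gather every node id that appears in always_preserve or on an edge.
    all_nodes = np.concatenate([always_preserve, src, dst])

    # Step 2: build an order-preserving unique list (the role of the hash
    # table's FillWithDuplicates) and a map from old id -> new compact id.
    _, first_idx = np.unique(all_nodes, return_index=True)
    induced_nodes = all_nodes[np.sort(first_idx)]
    relabel = {int(old): new for new, old in enumerate(induced_nodes)}

    # Step 3: remap the edge endpoints into the compact id space (MapEdges).
    new_src = np.array([relabel[int(v)] for v in src])
    new_dst = np.array([relabel[int(v)] for v in dst])
    return new_src, new_dst, induced_nodes

# e.g. edges 1->3 and 3->5 compact to 0->1 and 1->2 with induced nodes [1, 3, 5]
print(compact_reference(np.array([1, 3]), np.array([3, 5])))
```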
/*!
* Copyright 2020-2021 Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file graph/transform/cuda/cuda_map_edges.cuh
* \brief Device level functions for mapping edges.
*/
#ifndef DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_
#define DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_
#include <dgl/runtime/c_runtime_api.h>
#include <cuda_runtime.h>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../../runtime/cuda/cuda_hashtable.cuh"
using namespace dgl::aten;
using namespace dgl::runtime::cuda;
namespace dgl {
namespace transform {
namespace cuda {
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__device__ void map_vertex_ids(
const IdType * const global,
IdType * const new_global,
const IdType num_vertices,
const DeviceOrderedHashTable<IdType>& table) {
assert(BLOCK_SIZE == blockDim.x);
using Mapping = typename OrderedHashTable<IdType>::Mapping;
const IdType tile_start = TILE_SIZE*blockIdx.x;
const IdType tile_end = min(TILE_SIZE*(blockIdx.x+1), num_vertices);
for (IdType idx = threadIdx.x+tile_start; idx < tile_end; idx+=BLOCK_SIZE) {
const Mapping& mapping = *table.Search(global[idx]);
new_global[idx] = mapping.local;
}
}
/**
* \brief Generate mapped edge endpoint ids.
*
* \tparam IdType The type of id.
* \tparam BLOCK_SIZE The size of each thread block.
* \tparam TILE_SIZE The number of edges to process per thread block.
* \param global_srcs_device The source ids to map.
* \param new_global_srcs_device The mapped source ids (output).
* \param global_dsts_device The destination ids to map.
* \param new_global_dsts_device The mapped destination ids (output).
* \param num_edges The number of edges to map.
* \param src_mapping The mapping of source ids.
* \param src_hash_size The size of the source id hash table/mapping.
* \param dst_mapping The mapping of destination ids.
* \param dst_hash_size The size of the destination id hash table/mapping.
*/
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__global__ void map_edge_ids(
const IdType * const global_srcs_device,
IdType * const new_global_srcs_device,
const IdType * const global_dsts_device,
IdType * const new_global_dsts_device,
const IdType num_edges,
DeviceOrderedHashTable<IdType> src_mapping,
DeviceOrderedHashTable<IdType> dst_mapping) {
assert(BLOCK_SIZE == blockDim.x);
assert(2 == gridDim.y);
if (blockIdx.y == 0) {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_srcs_device,
new_global_srcs_device,
num_edges,
src_mapping);
} else {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_dsts_device,
new_global_dsts_device,
num_edges,
dst_mapping);
}
}
/**
* \brief Device level node maps for each node type.
*
* \param num_nodes Number of nodes per type.
* \param offset When offset is set to 0, LhsHashTable is identical to RhsHashTable.
* Or set it to num_nodes.size()/2 to use separate LhsHashTable and RhsHashTable.
* \param ctx The DGL context.
* \param stream The stream to operate on.
*/
template<typename IdType>
class DeviceNodeMap {
public:
using Mapping = typename OrderedHashTable<IdType>::Mapping;
DeviceNodeMap(
const std::vector<int64_t>& num_nodes,
const int64_t offset,
DGLContext ctx,
cudaStream_t stream) :
num_types_(num_nodes.size()),
rhs_offset_(offset),
workspaces_(),
hash_tables_(),
ctx_(ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
hash_tables_.reserve(num_types_);
workspaces_.reserve(num_types_);
for (int64_t i = 0; i < num_types_; ++i) {
hash_tables_.emplace_back(
new OrderedHashTable<IdType>(
num_nodes[i],
ctx_,
stream));
}
}
OrderedHashTable<IdType>& LhsHashTable(
const size_t index) {
return HashData(index);
}
OrderedHashTable<IdType>& RhsHashTable(
const size_t index) {
return HashData(index+rhs_offset_);
}
const OrderedHashTable<IdType>& LhsHashTable(
const size_t index) const {
return HashData(index);
}
const OrderedHashTable<IdType>& RhsHashTable(
const size_t index) const {
return HashData(index+rhs_offset_);
}
IdType LhsHashSize(
const size_t index) const {
return HashSize(index);
}
IdType RhsHashSize(
const size_t index) const {
return HashSize(rhs_offset_+index);
}
size_t Size() const {
return hash_tables_.size();
}
private:
int64_t num_types_;
size_t rhs_offset_;
std::vector<void*> workspaces_;
std::vector<std::unique_ptr<OrderedHashTable<IdType>>> hash_tables_;
DGLContext ctx_;
inline OrderedHashTable<IdType>& HashData(
const size_t index) {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline const OrderedHashTable<IdType>& HashData(
const size_t index) const {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline IdType HashSize(
const size_t index) const {
return HashData(index).size();
}
};
template<typename IdType>
inline size_t RoundUpDiv(
const IdType num,
const size_t divisor) {
return static_cast<IdType>(num/divisor) + (num % divisor == 0 ? 0 : 1);
}
template<typename IdType>
inline IdType RoundUp(
const IdType num,
const size_t unit) {
return RoundUpDiv(num, unit)*unit;
}
template<typename IdType>
std::tuple<std::vector<IdArray>, std::vector<IdArray>>
MapEdges(
HeteroGraphPtr graph,
const std::vector<EdgeArray>& edge_sets,
const DeviceNodeMap<IdType>& node_map,
cudaStream_t stream) {
constexpr const int BLOCK_SIZE = 128;
constexpr const size_t TILE_SIZE = 1024;
const auto& ctx = graph->Context();
std::vector<IdArray> new_lhs;
new_lhs.reserve(edge_sets.size());
std::vector<IdArray> new_rhs;
new_rhs.reserve(edge_sets.size());
// The next performance optimization here is to perform the mapping of all edge
// types in a single kernel launch.
const int64_t num_edge_sets = static_cast<int64_t>(edge_sets.size());
for (int64_t etype = 0; etype < num_edge_sets; ++etype) {
const EdgeArray& edges = edge_sets[etype];
if (edges.id.defined() && edges.src->shape[0] > 0) {
const int64_t num_edges = edges.src->shape[0];
new_lhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
new_rhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
const auto src_dst_types = graph->GetEndpointTypes(etype);
const int src_type = src_dst_types.first;
const int dst_type = src_dst_types.second;
const dim3 grid(RoundUpDiv(num_edges, TILE_SIZE), 2);
const dim3 block(BLOCK_SIZE);
// map the srcs
map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE><<<
grid,
block,
0,
stream>>>(
edges.src.Ptr<IdType>(),
new_lhs.back().Ptr<IdType>(),
edges.dst.Ptr<IdType>(),
new_rhs.back().Ptr<IdType>(),
num_edges,
node_map.LhsHashTable(src_type).DeviceHandle(),
node_map.RhsHashTable(dst_type).DeviceHandle());
CUDA_CALL(cudaGetLastError());
} else {
new_lhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
new_rhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
}
}
return std::tuple<std::vector<IdArray>, std::vector<IdArray>>(
std::move(new_lhs), std::move(new_rhs));
}
} // namespace cuda
} // namespace transform
} // namespace dgl
#endif  // DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_
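To make the kernel's work assignment above concrete, a small Python sketch of the tile/thread arithmetic in `map_vertex_ids` (the constants mirror the `BLOCK_SIZE` and `TILE_SIZE` values in `MapEdges`; the second grid dimension in `map_edge_ids` simply selects whether a block maps source or destination ids).

```python
# Plain-Python view of the index arithmetic in map_vertex_ids: block x covers
# ids in [x*TILE_SIZE, min((x+1)*TILE_SIZE, n)), and threads stride by BLOCK_SIZE.
BLOCK_SIZE, TILE_SIZE = 128, 1024

def ids_for_thread(block_x, thread_x, num_vertices):
    tile_start = TILE_SIZE * block_x
    tile_end = min(TILE_SIZE * (block_x + 1), num_vertices)
    return list(range(tile_start + thread_x, tile_end, BLOCK_SIZE))

# Thread 0 of block 1 touches ids 1024, 1152, ..., 1920 for a 2000-vertex input.
print(ids_for_thread(1, 0, 2000))
```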
...@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
-* \file graph/transform/cuda_to_block.cu
+* \file graph/transform/cuda/cuda_to_block.cu
* \brief Functions to convert a set of edges into a graph block with local
* ids.
*/
...@@ -27,182 +27,19 @@
#include <memory>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../../runtime/cuda/cuda_hashtable.cuh"
#include "../../heterograph.h" #include "../../heterograph.h"
#include "../to_bipartite.h" #include "../to_bipartite.h"
#include "cuda_map_edges.cuh"
using namespace dgl::aten;
using namespace dgl::runtime::cuda;
using namespace dgl::transform::cuda;
namespace dgl {
namespace transform {
namespace {
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__device__ void map_vertex_ids(
const IdType * const global,
IdType * const new_global,
const IdType num_vertices,
const DeviceOrderedHashTable<IdType>& table) {
assert(BLOCK_SIZE == blockDim.x);
using Mapping = typename OrderedHashTable<IdType>::Mapping;
const IdType tile_start = TILE_SIZE*blockIdx.x;
const IdType tile_end = min(TILE_SIZE*(blockIdx.x+1), num_vertices);
for (IdType idx = threadIdx.x+tile_start; idx < tile_end; idx+=BLOCK_SIZE) {
const Mapping& mapping = *table.Search(global[idx]);
new_global[idx] = mapping.local;
}
}
/**
* \brief Generate mapped edge endpoint ids.
*
* \tparam IdType The type of id.
* \tparam BLOCK_SIZE The size of each thread block.
* \tparam TILE_SIZE The number of edges to process per thread block.
* \param global_srcs_device The source ids to map.
* \param new_global_srcs_device The mapped source ids (output).
* \param global_dsts_device The destination ids to map.
* \param new_global_dsts_device The mapped destination ids (output).
* \param num_edges The number of edges to map.
* \param src_mapping The mapping of sources ids.
* \param src_hash_size The the size of source id hash table/mapping.
* \param dst_mapping The mapping of destination ids.
* \param dst_hash_size The the size of destination id hash table/mapping.
*/
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__global__ void map_edge_ids(
const IdType * const global_srcs_device,
IdType * const new_global_srcs_device,
const IdType * const global_dsts_device,
IdType * const new_global_dsts_device,
const IdType num_edges,
DeviceOrderedHashTable<IdType> src_mapping,
DeviceOrderedHashTable<IdType> dst_mapping) {
assert(BLOCK_SIZE == blockDim.x);
assert(2 == gridDim.y);
if (blockIdx.y == 0) {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_srcs_device,
new_global_srcs_device,
num_edges,
src_mapping);
} else {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_dsts_device,
new_global_dsts_device,
num_edges,
dst_mapping);
}
}
template<typename IdType>
inline size_t RoundUpDiv(
const IdType num,
const size_t divisor) {
return static_cast<IdType>(num/divisor) + (num % divisor == 0 ? 0 : 1);
}
template<typename IdType>
inline IdType RoundUp(
const IdType num,
const size_t unit) {
return RoundUpDiv(num, unit)*unit;
}
template<typename IdType>
class DeviceNodeMap {
public:
using Mapping = typename OrderedHashTable<IdType>::Mapping;
DeviceNodeMap(
const std::vector<int64_t>& num_nodes,
DGLContext ctx,
cudaStream_t stream) :
num_types_(num_nodes.size()),
rhs_offset_(num_types_/2),
workspaces_(),
hash_tables_(),
ctx_(ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
hash_tables_.reserve(num_types_);
workspaces_.reserve(num_types_);
for (int64_t i = 0; i < num_types_; ++i) {
hash_tables_.emplace_back(
new OrderedHashTable<IdType>(
num_nodes[i],
ctx_,
stream));
}
}
OrderedHashTable<IdType>& LhsHashTable(
const size_t index) {
return HashData(index);
}
OrderedHashTable<IdType>& RhsHashTable(
const size_t index) {
return HashData(index+rhs_offset_);
}
const OrderedHashTable<IdType>& LhsHashTable(
const size_t index) const {
return HashData(index);
}
const OrderedHashTable<IdType>& RhsHashTable(
const size_t index) const {
return HashData(index+rhs_offset_);
}
IdType LhsHashSize(
const size_t index) const {
return HashSize(index);
}
IdType RhsHashSize(
const size_t index) const {
return HashSize(rhs_offset_+index);
}
size_t Size() const {
return hash_tables_.size();
}
private:
int64_t num_types_;
size_t rhs_offset_;
std::vector<void*> workspaces_;
std::vector<std::unique_ptr<OrderedHashTable<IdType>>> hash_tables_;
DGLContext ctx_;
inline OrderedHashTable<IdType>& HashData(
const size_t index) {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline const OrderedHashTable<IdType>& HashData(
const size_t index) const {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline IdType HashSize(
const size_t index) const {
return HashData(index).size();
}
};
template<typename IdType>
class DeviceNodeMapMaker {
public:
...@@ -315,67 +152,6 @@ class DeviceNodeMapMaker {
IdType max_num_nodes_;
};
template<typename IdType>
std::tuple<std::vector<IdArray>, std::vector<IdArray>>
MapEdges(
HeteroGraphPtr graph,
const std::vector<EdgeArray>& edge_sets,
const DeviceNodeMap<IdType>& node_map,
cudaStream_t stream) {
constexpr const int BLOCK_SIZE = 128;
constexpr const size_t TILE_SIZE = 1024;
const auto& ctx = graph->Context();
std::vector<IdArray> new_lhs;
new_lhs.reserve(edge_sets.size());
std::vector<IdArray> new_rhs;
new_rhs.reserve(edge_sets.size());
// The next peformance optimization here, is to perform mapping of all edge
// types in a single kernel launch.
const int64_t num_edge_sets = static_cast<int64_t>(edge_sets.size());
for (int64_t etype = 0; etype < num_edge_sets; ++etype) {
const EdgeArray& edges = edge_sets[etype];
if (edges.id.defined() && edges.src->shape[0] > 0) {
const int64_t num_edges = edges.src->shape[0];
new_lhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
new_rhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
const auto src_dst_types = graph->GetEndpointTypes(etype);
const int src_type = src_dst_types.first;
const int dst_type = src_dst_types.second;
const dim3 grid(RoundUpDiv(num_edges, TILE_SIZE), 2);
const dim3 block(BLOCK_SIZE);
// map the srcs
map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE><<<
grid,
block,
0,
stream>>>(
edges.src.Ptr<IdType>(),
new_lhs.back().Ptr<IdType>(),
edges.dst.Ptr<IdType>(),
new_rhs.back().Ptr<IdType>(),
num_edges,
node_map.LhsHashTable(src_type).DeviceHandle(),
node_map.RhsHashTable(dst_type).DeviceHandle());
CUDA_CALL(cudaGetLastError());
} else {
new_lhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
new_rhs.emplace_back(
aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
}
}
return std::tuple<std::vector<IdArray>, std::vector<IdArray>>(
std::move(new_lhs), std::move(new_rhs));
}
// Since partial specialization is not allowed for functions, use this as an
// intermediate for ToBlock where XPU = kDLGPU.
...@@ -487,7 +263,7 @@ ToBlockGPU(
// allocate space for map creation process
DeviceNodeMapMaker<IdType> maker(maxNodesPerType);
-DeviceNodeMap<IdType> node_maps(maxNodesPerType, ctx, stream);
+DeviceNodeMap<IdType> node_maps(maxNodesPerType, num_ntypes, ctx, stream);
if (generate_lhs_nodes) {
lhs_nodes.reserve(num_ntypes);
...
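The hunk above only passes `num_ntypes` as the new offset argument of the shared `DeviceNodeMap` and reuses the edge-mapping helpers moved into cuda_map_edges.cuh; the user-visible path it serves is the usual sample-then-to_block flow. A hedged sketch of that flow on the GPU (random graph, arbitrary seeds and fan-out, assuming a CUDA-enabled build):

```python
import dgl
import torch as th

g = dgl.rand_graph(1000, 5000).to('cuda')        # random homogeneous graph on the GPU
seeds = th.arange(10, device='cuda')             # destination (seed) nodes

frontier = dgl.sampling.sample_neighbors(g, seeds, 5)
block = dgl.to_block(frontier, dst_nodes=seeds)  # exercises the GPU ToBlock path
print(block.device, block.num_src_nodes(), block.num_dst_nodes())
```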
/*!
* Copyright (c) 2020 by Contributors
* \file graph/transform/cuda_to_block.h
* \brief Functions to convert a set of edges into a graph block with local
* ids.
*/
#ifndef DGL_GRAPH_TRANSFORM_CUDA_CUDA_TO_BLOCK_H_
#define DGL_GRAPH_TRANSFORM_CUDA_CUDA_TO_BLOCK_H_
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include <tuple>
namespace dgl {
namespace transform {
namespace cuda {
/**
* @brief Generate a subgraph with locally numbered vertices, from the given
* edge set.
*
* @param graph The set of edges to construct the subgraph from.
* @param rhs_nodes The unique set of destination vertices.
* @param include_rhs_in_lhs Whether or not to include the `rhs_nodes` in the
* set of source vertices for purposes of local numbering.
*
* @return The subgraph, the unique set of source nodes, and the mapping of
* subgraph edges to global edges.
*/
std::tuple<HeteroGraphPtr, std::vector<IdArray>, std::vector<IdArray>>
CudaToBlock(
HeteroGraphPtr graph,
const std::vector<IdArray>& rhs_nodes,
const bool include_rhs_in_lhs);
} // namespace cuda
} // namespace transform
} // namespace dgl
#endif // DGL_GRAPH_TRANSFORM_CUDA_CUDA_TO_BLOCK_H_
...@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
-* \file graph/transform/cuda_to_block.cu
+* \file graph/transform/to_bipartite.h
* \brief Functions to convert a set of edges into a graph block with local
* ids.
*/
...
...@@ -662,24 +662,23 @@ def test_reorder_nodes():
old_neighs2 = g.predecessors(old_nid)
assert np.all(np.sort(old_neighs1) == np.sort(F.asnumpy(old_neighs2)))
-@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU compaction not implemented")
@parametrize_dtype
def test_compact(idtype):
g1 = dgl.heterograph({
('user', 'follow', 'user'): ([1, 3], [3, 5]),
('user', 'plays', 'game'): ([2, 3, 2], [4, 4, 5]),
('game', 'wished-by', 'user'): ([6, 5], [7, 7])},
-{'user': 20, 'game': 10}, idtype=idtype)
+{'user': 20, 'game': 10}, idtype=idtype, device=F.ctx())
g2 = dgl.heterograph({
('game', 'clicked-by', 'user'): ([3], [1]),
('user', 'likes', 'user'): ([1, 8], [8, 9])},
-{'user': 20, 'game': 10}, idtype=idtype)
+{'user': 20, 'game': 10}, idtype=idtype, device=F.ctx())
g3 = dgl.heterograph({('user', '_E', 'user'): ((0, 1), (1, 2))},
-{'user': 10}, idtype=idtype)
+{'user': 10}, idtype=idtype, device=F.ctx())
g4 = dgl.heterograph({('user', '_E', 'user'): ((1, 3), (3, 5))},
-{'user': 10}, idtype=idtype)
+{'user': 10}, idtype=idtype, device=F.ctx())
def _check(g, new_g, induced_nodes):
assert g.ntypes == new_g.ntypes
...