Unverified Commit 8f0df39e authored by Hongzhi (Steve), Chen, committed by GitHub

[Misc] clang-format auto fix. (#4810)



* [Misc] clang-format auto fix.

* manual

* manual
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 401e1278
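
The hunks below are mechanical: clang-format (presumably configured with an 80-column limit) rewraps long declarations and calls by breaking right after the opening parenthesis and indenting the whole argument list by four spaces, rather than aligning continuation arguments under the first argument. The very first hunk already shows the pattern, reproduced here for reference:

// Before: continuation arguments aligned under the first argument.
NDArray CreateNDArrayFromRawData(std::vector<int64_t> shape, DGLDataType dtype,
                                 DGLContext ctx, void* raw);

// After: break after '(' and use a plain four-space continuation indent.
NDArray CreateNDArrayFromRawData(
    std::vector<int64_t> shape, DGLDataType dtype, DGLContext ctx, void* raw);
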
......@@ -13,8 +13,8 @@ namespace dgl {
using dgl::runtime::NDArray;
NDArray CreateNDArrayFromRawData(std::vector<int64_t> shape, DGLDataType dtype,
DGLContext ctx, void* raw) {
NDArray CreateNDArrayFromRawData(
std::vector<int64_t> shape, DGLDataType dtype, DGLContext ctx, void* raw) {
return NDArray::CreateFromRaw(shape, dtype, ctx, raw, true);
}
......@@ -25,9 +25,9 @@ void StreamWithBuffer::PushNDArray(const NDArray& tensor) {
int ndim = tensor->ndim;
this->WriteArray(tensor->shape, ndim);
CHECK(tensor.IsContiguous())
<< "StreamWithBuffer only supports contiguous tensor";
<< "StreamWithBuffer only supports contiguous tensor";
CHECK_EQ(tensor->byte_offset, 0)
<< "StreamWithBuffer only supports zero byte offset tensor";
<< "StreamWithBuffer only supports zero byte offset tensor";
int type_bytes = tensor->dtype.bits / 8;
int64_t num_elems = 1;
for (int i = 0; i < ndim; ++i) {
......@@ -40,7 +40,8 @@ void StreamWithBuffer::PushNDArray(const NDArray& tensor) {
// If the stream is for remote communication or the data is not stored in
// shared memory, serialize the data content as a buffer.
this->Write<bool>(false);
// If this is a null ndarray, we will not push it into the underlying buffer_list
// If this is a null ndarray, we will not push it into the underlying
// buffer_list
if (data_byte_size != 0) {
buffer_list_.emplace_back(tensor, tensor->data, data_byte_size);
}
......@@ -90,8 +91,8 @@ NDArray StreamWithBuffer::PopNDArray() {
// Means this is a null ndarray
ret = CreateNDArrayFromRawData(shape, dtype, cpu_ctx, nullptr);
} else {
ret = CreateNDArrayFromRawData(shape, dtype, cpu_ctx,
buffer_list_.front().data);
ret = CreateNDArrayFromRawData(
shape, dtype, cpu_ctx, buffer_list_.front().data);
buffer_list_.pop_front();
}
return ret;
......
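
For context on the serializer hunk above: PushNDArray writes the shape and dtype to the stream and, when the tensor is not backed by shared memory, keeps a reference to the raw buffer in buffer_list_; PopNDArray later rebuilds an NDArray around that buffer via CreateNDArrayFromRawData. A minimal round-trip sketch, assuming a StreamWithBuffer instance `strm` has already been constructed for local use (its constructors are not part of this diff); only calls visible above are used:

// Sketch only: `strm` is an already-constructed dgl::StreamWithBuffer.
using dgl::runtime::NDArray;
NDArray tensor = NDArray::Empty(
    {3, 4}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
strm.PushNDArray(tensor);              // records shape/dtype, buffers the data
NDArray restored = strm.PopNDArray();  // rebuilds an NDArray from that buffer
CHECK_EQ(restored->ndim, tensor->ndim);
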
......@@ -31,8 +31,8 @@ using namespace dgl::aten;
namespace dgl {
template <>
NDArray SharedMemManager::CopyToSharedMem<NDArray>(const NDArray &data,
std::string name) {
NDArray SharedMemManager::CopyToSharedMem<NDArray>(
const NDArray &data, std::string name) {
DGLContext ctx = {kDGLCPU, 0};
std::vector<int64_t> shape(data->shape, data->shape + data->ndim);
strm_->Write(data->ndim);
......@@ -46,28 +46,29 @@ NDArray SharedMemManager::CopyToSharedMem<NDArray>(const NDArray &data,
return data;
} else {
auto nd =
NDArray::EmptyShared(graph_name_ + name, shape, data->dtype, ctx, true);
NDArray::EmptyShared(graph_name_ + name, shape, data->dtype, ctx, true);
nd.CopyFrom(data);
return nd;
}
}
template <>
CSRMatrix SharedMemManager::CopyToSharedMem<CSRMatrix>(const CSRMatrix &csr,
std::string name) {
CSRMatrix SharedMemManager::CopyToSharedMem<CSRMatrix>(
const CSRMatrix &csr, std::string name) {
auto indptr_shared_mem = CopyToSharedMem(csr.indptr, name + "_indptr");
auto indices_shared_mem = CopyToSharedMem(csr.indices, name + "_indices");
auto data_shared_mem = CopyToSharedMem(csr.data, name + "_data");
strm_->Write(csr.num_rows);
strm_->Write(csr.num_cols);
strm_->Write(csr.sorted);
return CSRMatrix(csr.num_rows, csr.num_cols, indptr_shared_mem,
indices_shared_mem, data_shared_mem, csr.sorted);
return CSRMatrix(
csr.num_rows, csr.num_cols, indptr_shared_mem, indices_shared_mem,
data_shared_mem, csr.sorted);
}
template <>
COOMatrix SharedMemManager::CopyToSharedMem<COOMatrix>(const COOMatrix &coo,
std::string name) {
COOMatrix SharedMemManager::CopyToSharedMem<COOMatrix>(
const COOMatrix &coo, std::string name) {
auto row_shared_mem = CopyToSharedMem(coo.row, name + "_row");
auto col_shared_mem = CopyToSharedMem(coo.col, name + "_col");
auto data_shared_mem = CopyToSharedMem(coo.data, name + "_data");
......@@ -75,13 +76,14 @@ COOMatrix SharedMemManager::CopyToSharedMem<COOMatrix>(const COOMatrix &coo,
strm_->Write(coo.num_cols);
strm_->Write(coo.row_sorted);
strm_->Write(coo.col_sorted);
return COOMatrix(coo.num_rows, coo.num_cols, row_shared_mem, col_shared_mem,
data_shared_mem, coo.row_sorted, coo.col_sorted);
return COOMatrix(
coo.num_rows, coo.num_cols, row_shared_mem, col_shared_mem,
data_shared_mem, coo.row_sorted, coo.col_sorted);
}
template <>
bool SharedMemManager::CreateFromSharedMem<NDArray>(NDArray *nd,
std::string name) {
bool SharedMemManager::CreateFromSharedMem<NDArray>(
NDArray *nd, std::string name) {
int ndim;
DGLContext ctx = {kDGLCPU, 0};
DGLDataType dtype;
......@@ -98,15 +100,14 @@ bool SharedMemManager::CreateFromSharedMem<NDArray>(NDArray *nd,
if (is_null) {
*nd = NDArray::Empty(shape, dtype, ctx);
} else {
*nd =
NDArray::EmptyShared(graph_name_ + name, shape, dtype, ctx, false);
*nd = NDArray::EmptyShared(graph_name_ + name, shape, dtype, ctx, false);
}
return true;
}
template <>
bool SharedMemManager::CreateFromSharedMem<COOMatrix>(COOMatrix *coo,
std::string name) {
bool SharedMemManager::CreateFromSharedMem<COOMatrix>(
COOMatrix *coo, std::string name) {
CreateFromSharedMem(&coo->row, name + "_row");
CreateFromSharedMem(&coo->col, name + "_col");
CreateFromSharedMem(&coo->data, name + "_data");
......@@ -118,8 +119,8 @@ bool SharedMemManager::CreateFromSharedMem<COOMatrix>(COOMatrix *coo,
}
template <>
bool SharedMemManager::CreateFromSharedMem<CSRMatrix>(CSRMatrix *csr,
std::string name) {
bool SharedMemManager::CreateFromSharedMem<CSRMatrix>(
CSRMatrix *csr, std::string name) {
CreateFromSharedMem(&csr->indptr, name + "_indptr");
CreateFromSharedMem(&csr->indices, name + "_indices");
CreateFromSharedMem(&csr->data, name + "_data");
......
......@@ -29,8 +29,7 @@ const size_t SHARED_MEM_METAINFO_SIZE_MAX = 1024 * 32;
class SharedMemManager : public dmlc::Stream {
public:
explicit SharedMemManager(std::string graph_name, dmlc::Stream* strm)
: graph_name_(graph_name),
strm_(strm) {}
: graph_name_(graph_name), strm_(strm) {}
template <typename T>
T CopyToSharedMem(const T& data, std::string name);
......
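
A rough usage sketch for SharedMemManager, based only on the declarations visible above; the metadata stream (`meta_strm` below), the shared-memory segment names, and the two-process calling pattern are assumptions, not something this diff specifies. `csr` stands for an existing dgl::aten::CSRMatrix:

// Sketch only: `meta_strm` is a hypothetical dmlc::Stream* carrying the
// metadata written via strm_->Write(...) from the writer to the reader.
dgl::SharedMemManager writer("my_graph", meta_strm);
dgl::aten::CSRMatrix csr_shared = writer.CopyToSharedMem(csr, "_adj");

// In the consumer process, rebuild the matrix from the same names.
dgl::SharedMemManager reader("my_graph", meta_strm);
dgl::aten::CSRMatrix csr_copy;
bool ok = reader.CreateFromSharedMem(&csr_copy, "_adj");
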
......@@ -11,7 +11,8 @@ namespace dgl {
HeteroSubgraph InEdgeGraphRelabelNodes(
const HeteroGraphPtr graph, const std::vector<IdArray>& vids) {
CHECK_EQ(vids.size(), graph->NumVertexTypes())
<< "Invalid input: the input list size must be the same as the number of vertex types.";
<< "Invalid input: the input list size must be the same as the number of "
"vertex types.";
std::vector<IdArray> eids(graph->NumEdgeTypes());
DGLContext ctx = aten::GetContextOf(vids);
for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
......@@ -29,9 +30,11 @@ HeteroSubgraph InEdgeGraphRelabelNodes(
HeteroSubgraph InEdgeGraphNoRelabelNodes(
const HeteroGraphPtr graph, const std::vector<IdArray>& vids) {
// TODO(mufei): This should also use EdgeSubgraph once it is supported for CSR graphs
// TODO(mufei): This should also use EdgeSubgraph once it is supported for CSR
// graphs
CHECK_EQ(vids.size(), graph->NumVertexTypes())
<< "Invalid input: the input list size must be the same as the number of vertex types.";
<< "Invalid input: the input list size must be the same as the number of "
"vertex types.";
std::vector<HeteroGraphPtr> subrels(graph->NumEdgeTypes());
std::vector<IdArray> induced_edges(graph->NumEdgeTypes());
DGLContext ctx = aten::GetContextOf(vids);
......@@ -43,30 +46,28 @@ HeteroSubgraph InEdgeGraphNoRelabelNodes(
if (aten::IsNullArray(vids[dst_vtype])) {
// create a placeholder graph
subrels[etype] = UnitGraph::Empty(
relgraph->NumVertexTypes(),
graph->NumVertices(src_vtype),
graph->NumVertices(dst_vtype),
graph->DataType(), ctx);
induced_edges[etype] = IdArray::Empty({0}, graph->DataType(), graph->Context());
relgraph->NumVertexTypes(), graph->NumVertices(src_vtype),
graph->NumVertices(dst_vtype), graph->DataType(), ctx);
induced_edges[etype] =
IdArray::Empty({0}, graph->DataType(), graph->Context());
} else {
const auto& earr = graph->InEdges(etype, {vids[dst_vtype]});
subrels[etype] = UnitGraph::CreateFromCOO(
relgraph->NumVertexTypes(),
graph->NumVertices(src_vtype),
graph->NumVertices(dst_vtype),
earr.src,
earr.dst);
relgraph->NumVertexTypes(), graph->NumVertices(src_vtype),
graph->NumVertices(dst_vtype), earr.src, earr.dst);
induced_edges[etype] = earr.id;
}
}
HeteroSubgraph ret;
ret.graph = CreateHeteroGraph(graph->meta_graph(), subrels, graph->NumVerticesPerType());
ret.graph = CreateHeteroGraph(
graph->meta_graph(), subrels, graph->NumVerticesPerType());
ret.induced_edges = std::move(induced_edges);
return ret;
}
HeteroSubgraph InEdgeGraph(
const HeteroGraphPtr graph, const std::vector<IdArray>& vids, bool relabel_nodes) {
const HeteroGraphPtr graph, const std::vector<IdArray>& vids,
bool relabel_nodes) {
if (relabel_nodes) {
return InEdgeGraphRelabelNodes(graph, vids);
} else {
......@@ -77,7 +78,8 @@ HeteroSubgraph InEdgeGraph(
HeteroSubgraph OutEdgeGraphRelabelNodes(
const HeteroGraphPtr graph, const std::vector<IdArray>& vids) {
CHECK_EQ(vids.size(), graph->NumVertexTypes())
<< "Invalid input: the input list size must be the same as the number of vertex types.";
<< "Invalid input: the input list size must be the same as the number of "
"vertex types.";
std::vector<IdArray> eids(graph->NumEdgeTypes());
DGLContext ctx = aten::GetContextOf(vids);
for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
......@@ -95,9 +97,11 @@ HeteroSubgraph OutEdgeGraphRelabelNodes(
HeteroSubgraph OutEdgeGraphNoRelabelNodes(
const HeteroGraphPtr graph, const std::vector<IdArray>& vids) {
// TODO(mufei): This should also use EdgeSubgraph once it is supported for CSR graphs
// TODO(mufei): This should also use EdgeSubgraph once it is supported for CSR
// graphs
CHECK_EQ(vids.size(), graph->NumVertexTypes())
<< "Invalid input: the input list size must be the same as the number of vertex types.";
<< "Invalid input: the input list size must be the same as the number of "
"vertex types.";
std::vector<HeteroGraphPtr> subrels(graph->NumEdgeTypes());
std::vector<IdArray> induced_edges(graph->NumEdgeTypes());
DGLContext ctx = aten::GetContextOf(vids);
......@@ -109,30 +113,28 @@ HeteroSubgraph OutEdgeGraphNoRelabelNodes(
if (aten::IsNullArray(vids[src_vtype])) {
// create a placeholder graph
subrels[etype] = UnitGraph::Empty(
relgraph->NumVertexTypes(),
graph->NumVertices(src_vtype),
graph->NumVertices(dst_vtype),
graph->DataType(), ctx);
induced_edges[etype] = IdArray::Empty({0}, graph->DataType(), graph->Context());
relgraph->NumVertexTypes(), graph->NumVertices(src_vtype),
graph->NumVertices(dst_vtype), graph->DataType(), ctx);
induced_edges[etype] =
IdArray::Empty({0}, graph->DataType(), graph->Context());
} else {
const auto& earr = graph->OutEdges(etype, {vids[src_vtype]});
subrels[etype] = UnitGraph::CreateFromCOO(
relgraph->NumVertexTypes(),
graph->NumVertices(src_vtype),
graph->NumVertices(dst_vtype),
earr.src,
earr.dst);
relgraph->NumVertexTypes(), graph->NumVertices(src_vtype),
graph->NumVertices(dst_vtype), earr.src, earr.dst);
induced_edges[etype] = earr.id;
}
}
HeteroSubgraph ret;
ret.graph = CreateHeteroGraph(graph->meta_graph(), subrels, graph->NumVerticesPerType());
ret.graph = CreateHeteroGraph(
graph->meta_graph(), subrels, graph->NumVerticesPerType());
ret.induced_edges = std::move(induced_edges);
return ret;
}
HeteroSubgraph OutEdgeGraph(
const HeteroGraphPtr graph, const std::vector<IdArray>& vids, bool relabel_nodes) {
const HeteroGraphPtr graph, const std::vector<IdArray>& vids,
bool relabel_nodes) {
if (relabel_nodes) {
return OutEdgeGraphRelabelNodes(graph, vids);
} else {
......
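
The reformatted entry points above keep their original signatures; a minimal call sketch, assuming `graph` is an existing dgl::HeteroGraphPtr and `vids` holds exactly one IdArray per vertex type, as the CHECK_EQ in both functions requires:

// Sketch only: `graph` and `vids` are assumed to exist (see above).
dgl::HeteroSubgraph in_sub =
    dgl::InEdgeGraph(graph, vids, /*relabel_nodes=*/true);
dgl::HeteroSubgraph out_sub =
    dgl::OutEdgeGraph(graph, vids, /*relabel_nodes=*/false);
// *.graph holds the edge-induced subgraph; *.induced_edges stores, per edge
// type, the IDs of the parent-graph edges that were kept.
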
......@@ -19,18 +19,20 @@
#include "compact.h"
#include <dgl/base_heterograph.h>
#include <dgl/transform.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/container.h>
#include <vector>
#include <dgl/runtime/registry.h>
#include <dgl/transform.h>
#include <utility>
#include <vector>
#include "../../c_api_common.h"
#include "../unit_graph.h"
// TODO(BarclayII): currently CompactGraphs depends on IdHashMap implementation which
// only works on CPU. Should fix later to make it device agnostic.
// TODO(BarclayII): currently CompactGraphs depends on IdHashMap implementation
// which only works on CPU. Should fix later to make it device agnostic.
#include "../../array/cpu/array_utils.h"
namespace dgl {
......@@ -42,16 +44,16 @@ namespace transform {
namespace {
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphsCPU(
template <typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphsCPU(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
// TODO(BarclayII): check whether the node space and metagraph of each graph are the same.
// Step 1: Collect the nodes that have connections for each type.
// TODO(BarclayII): check whether the node space and metagraph of each graph
// are the same. Step 1: Collect the nodes that have connections for each type.
const int64_t num_ntypes = graphs[0]->NumVertexTypes();
std::vector<aten::IdHashMap<IdType>> hashmaps(num_ntypes);
std::vector<std::vector<EdgeArray>> all_edges(graphs.size()); // all_edges[i][etype]
std::vector<std::vector<EdgeArray>> all_edges(
graphs.size()); // all_edges[i][etype]
std::vector<int64_t> max_vertex_cnt(num_ntypes, 0);
for (size_t i = 0; i < graphs.size(); ++i) {
......@@ -98,7 +100,8 @@ CompactGraphsCPU(
}
}
// Step 2: Relabel the nodes for each type to a smaller ID space and save the mapping.
// Step 2: Relabel the nodes for each type to a smaller ID space and save the
// mapping.
std::vector<IdArray> induced_nodes(num_ntypes);
std::vector<int64_t> num_induced_nodes(num_ntypes);
for (int64_t i = 0; i < num_ntypes; ++i) {
......@@ -123,14 +126,12 @@ CompactGraphsCPU(
const IdArray mapped_cols = hashmaps[dsttype].Map(edges.dst, -1);
rel_graphs.push_back(UnitGraph::CreateFromCOO(
srctype == dsttype ? 1 : 2,
induced_nodes[srctype]->shape[0],
induced_nodes[dsttype]->shape[0],
mapped_rows,
mapped_cols));
srctype == dsttype ? 1 : 2, induced_nodes[srctype]->shape[0],
induced_nodes[dsttype]->shape[0], mapped_rows, mapped_cols));
}
new_graphs.push_back(CreateHeteroGraph(meta_graph, rel_graphs, num_induced_nodes));
new_graphs.push_back(
CreateHeteroGraph(meta_graph, rel_graphs, num_induced_nodes));
}
return std::make_pair(new_graphs, induced_nodes);
......@@ -138,7 +139,7 @@ CompactGraphsCPU(
}; // namespace
template<>
template <>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDGLCPU, int32_t>(
const std::vector<HeteroGraphPtr> &graphs,
......@@ -146,7 +147,7 @@ CompactGraphs<kDGLCPU, int32_t>(
return CompactGraphsCPU<int32_t>(graphs, always_preserve);
}
template<>
template <>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDGLCPU, int64_t>(
const std::vector<HeteroGraphPtr> &graphs,
......@@ -155,44 +156,44 @@ CompactGraphs<kDGLCPU, int64_t>(
}
DGL_REGISTER_GLOBAL("transform._CAPI_DGLCompactGraphs")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
List<HeteroGraphRef> graph_refs = args[0];
List<Value> always_preserve_refs = args[1];
std::vector<HeteroGraphPtr> graphs;
std::vector<IdArray> always_preserve;
for (HeteroGraphRef gref : graph_refs)
graphs.push_back(gref.sptr());
for (Value array : always_preserve_refs)
always_preserve.push_back(array->data);
// TODO(BarclayII): check for all IdArrays
CHECK(graphs[0]->DataType() == always_preserve[0]->dtype) << "data type mismatch.";
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result_pair;
ATEN_XPU_SWITCH_CUDA(graphs[0]->Context().device_type, XPU, "CompactGraphs", {
ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
result_pair = CompactGraphs<XPU, IdType>(
graphs, always_preserve);
});
.set_body([](DGLArgs args, DGLRetValue *rv) {
List<HeteroGraphRef> graph_refs = args[0];
List<Value> always_preserve_refs = args[1];
std::vector<HeteroGraphPtr> graphs;
std::vector<IdArray> always_preserve;
for (HeteroGraphRef gref : graph_refs) graphs.push_back(gref.sptr());
for (Value array : always_preserve_refs)
always_preserve.push_back(array->data);
// TODO(BarclayII): check for all IdArrays
CHECK(graphs[0]->DataType() == always_preserve[0]->dtype)
<< "data type mismatch.";
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result_pair;
ATEN_XPU_SWITCH_CUDA(
graphs[0]->Context().device_type, XPU, "CompactGraphs", {
ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
result_pair = CompactGraphs<XPU, IdType>(graphs, always_preserve);
});
});
List<HeteroGraphRef> compacted_graph_refs;
List<Value> induced_nodes;
for (const HeteroGraphPtr g : result_pair.first)
compacted_graph_refs.push_back(HeteroGraphRef(g));
for (const IdArray &ids : result_pair.second)
induced_nodes.push_back(Value(MakeValue(ids)));
List<ObjectRef> result;
result.push_back(compacted_graph_refs);
result.push_back(induced_nodes);
*rv = result;
});
List<HeteroGraphRef> compacted_graph_refs;
List<Value> induced_nodes;
for (const HeteroGraphPtr g : result_pair.first)
compacted_graph_refs.push_back(HeteroGraphRef(g));
for (const IdArray &ids : result_pair.second)
induced_nodes.push_back(Value(MakeValue(ids)));
List<ObjectRef> result;
result.push_back(compacted_graph_refs);
result.push_back(induced_nodes);
*rv = result;
});
}; // namespace transform
}; // namespace dgl
......@@ -24,8 +24,8 @@
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include <utility>
#include <vector>
namespace dgl {
namespace transform {
......@@ -41,9 +41,8 @@ namespace transform {
*
* @return The vector of compacted graphs and the vector of induced nodes.
*/
template<DGLDeviceType XPU, typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(
template <DGLDeviceType XPU, typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphs(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve);
......
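
A short call sketch for the declaration above, assuming `blocks` already holds HeteroGraphPtr objects that share one node space and `seeds` holds the per-type node IDs that must survive compaction:

// Sketch only: `blocks` and `seeds` are assumed to exist (see above).
auto result =
    dgl::transform::CompactGraphs<kDGLCPU, int64_t>(blocks, seeds);
const std::vector<dgl::HeteroGraphPtr>& compacted = result.first;
const std::vector<dgl::IdArray>& induced_nodes = result.second;
// induced_nodes[ntype][new_id] is the original ID of the new_id-th node kept.
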
......@@ -9,7 +9,9 @@
#include <dgl/array.h>
#include <dmlc/logging.h>
#include <nanoflann.hpp>
#include "../../../c_api_common.h"
namespace dgl {
......@@ -17,78 +19,75 @@ namespace transform {
namespace knn_utils {
/*!
* \brief A simple 2D NDArray adapter for nanoflann, without duplicating the storage.
* \brief A simple 2D NDArray adapter for nanoflann, without duplicating the
* storage.
*
* \tparam FloatType: The type of the point coordinates (typically, double or float).
* \tparam IdType: The type for indices in the KD-tree index (typically, size_t or int)
* \tparam FeatureDim: If set to > 0, it specifies a compile-time fixed dimensionality
* for the points in the data set, allowing more compiler optimizations.
* \tparam Dist: The distance metric to use: nanoflann::metric_L1, nanoflann::metric_L2,
* nanoflann::metric_L2_Simple, etc.
* \note The spelling of dgl's adapter ("adapter") is different from nanoflann ("adaptor")
* \tparam FloatType: The type of the point coordinates (typically, double or
* float).
* \tparam IdType: The type for indices in the KD-tree index (typically,
* size_t or int)
* \tparam FeatureDim: If set to > 0, it specifies a compile-time fixed
* dimensionality for the points in the data set, allowing more compiler
* optimizations.
* \tparam Dist: The distance metric to use: nanoflann::metric_L1,
nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc.
* \note The spelling of dgl's adapter ("adapter") is different from nanoflann
* ("adaptor")
*/
template <typename FloatType,
typename IdType,
int FeatureDim = -1,
typename Dist = nanoflann::metric_L2>
template <
typename FloatType, typename IdType, int FeatureDim = -1,
typename Dist = nanoflann::metric_L2>
class KDTreeNDArrayAdapter {
public:
using self_type = KDTreeNDArrayAdapter<FloatType, IdType, FeatureDim, Dist>;
using metric_type = typename Dist::template traits<FloatType, self_type>::distance_t;
using metric_type =
typename Dist::template traits<FloatType, self_type>::distance_t;
using index_type = nanoflann::KDTreeSingleIndexAdaptor<
metric_type, self_type, FeatureDim, IdType>;
metric_type, self_type, FeatureDim, IdType>;
KDTreeNDArrayAdapter(const size_t /* dims */,
const NDArray data_points,
const int leaf_max_size = 10)
KDTreeNDArrayAdapter(
const size_t /* dims */, const NDArray data_points,
const int leaf_max_size = 10)
: data_(data_points) {
CHECK(data_points->shape[0] != 0 && data_points->shape[1] != 0)
<< "Tensor containing input data point set must be 2D.";
<< "Tensor containing input data point set must be 2D.";
const size_t dims = data_points->shape[1];
CHECK(!(FeatureDim > 0 && static_cast<int>(dims) != FeatureDim))
<< "Data set feature dimension does not match the 'FeatureDim' "
<< "template argument.";
<< "Data set feature dimension does not match the 'FeatureDim' "
<< "template argument.";
index_ = new index_type(
static_cast<int>(dims), *this, nanoflann::KDTreeSingleIndexAdaptorParams(leaf_max_size));
static_cast<int>(dims), *this,
nanoflann::KDTreeSingleIndexAdaptorParams(leaf_max_size));
index_->buildIndex();
}
~KDTreeNDArrayAdapter() {
delete index_;
}
~KDTreeNDArrayAdapter() { delete index_; }
index_type* GetIndex() {
return index_;
}
index_type* GetIndex() { return index_; }
/*!
* \brief Query for the \a num_closest points to a given point
* Note that this is a short-cut method for GetIndex()->findNeighbors().
*/
void query(const FloatType* query_pt, const size_t num_closest,
IdType* out_idxs, FloatType* out_dists) const {
void query(
const FloatType* query_pt, const size_t num_closest, IdType* out_idxs,
FloatType* out_dists) const {
nanoflann::KNNResultSet<FloatType, IdType> resultSet(num_closest);
resultSet.init(out_idxs, out_dists);
index_->findNeighbors(resultSet, query_pt, nanoflann::SearchParams());
}
/*! \brief Interface expected by KDTreeSingleIndexAdaptor */
const self_type& derived() const {
return *this;
}
const self_type& derived() const { return *this; }
/*! \brief Interface expected by KDTreeSingleIndexAdaptor */
self_type& derived() {
return *this;
}
self_type& derived() { return *this; }
/*!
* \brief Interface expected by KDTreeSingleIndexAdaptor,
* return the number of data points
*/
size_t kdtree_get_point_count() const {
return data_->shape[0];
}
size_t kdtree_get_point_count() const { return data_->shape[0]; }
/*!
* \brief Interface expected by KDTreeSingleIndexAdaptor,
......@@ -110,7 +109,7 @@ class KDTreeNDArrayAdapter {
}
private:
index_type* index_; // The kd tree index
index_type* index_; // The kd tree index
const NDArray data_; // data points
};
......
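
A minimal driving sketch for the adapter above, using only the constructor and query() declared in this header; `points` is assumed to be an existing 2D float32 CPU NDArray of shape (num_points, dim):

// Sketch only: `points` is an existing (num_points, dim) float32 CPU NDArray.
using dgl::transform::knn_utils::KDTreeNDArrayAdapter;

const size_t dim = points->shape[1];
KDTreeNDArrayAdapter<float, int64_t> kdtree(dim, points);  // builds the index

const int k = 5;
std::vector<int64_t> out_ids(k);
std::vector<float> out_dists(k);
const float* query_pt = static_cast<float*>(points->data);  // query with row 0
kdtree.query(query_pt, k, out_ids.data(), out_dists.data());
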
......@@ -4,16 +4,19 @@
* \brief k-nearest-neighbor (KNN) implementation
*/
#include <dgl/runtime/device_api.h>
#include "../knn.h"
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/parallel_for.h>
#include <dmlc/omp.h>
#include <vector>
#include <tuple>
#include <limits>
#include <algorithm>
#include <limits>
#include <tuple>
#include <vector>
#include "kdtree_ndarray_adapter.h"
#include "../knn.h"
using namespace dgl::runtime;
using namespace dgl::transform::knn_utils;
......@@ -30,8 +33,9 @@ static constexpr int NN_DESCENT_BLOCK_SIZE = 16384;
* distance.
*/
template <typename FloatType, typename IdType>
FloatType EuclideanDistWithCheck(const FloatType* vec1, const FloatType* vec2, int64_t dim,
FloatType worst_dist = std::numeric_limits<FloatType>::max()) {
FloatType EuclideanDistWithCheck(
const FloatType* vec1, const FloatType* vec2, int64_t dim,
FloatType worst_dist = std::numeric_limits<FloatType>::max()) {
FloatType dist = 0;
bool early_stop = false;
......@@ -52,7 +56,8 @@ FloatType EuclideanDistWithCheck(const FloatType* vec1, const FloatType* vec2, i
/*! \brief Compute Euclidean distance between two vectors */
template <typename FloatType, typename IdType>
FloatType EuclideanDist(const FloatType* vec1, const FloatType* vec2, int64_t dim) {
FloatType EuclideanDist(
const FloatType* vec1, const FloatType* vec2, int64_t dim) {
FloatType dist = 0;
for (IdType idx = 0; idx < dim; ++idx) {
......@@ -64,9 +69,9 @@ FloatType EuclideanDist(const FloatType* vec1, const FloatType* vec2, int64_t di
/*! \brief Insert a new element into a heap */
template <typename FloatType, typename IdType>
void HeapInsert(IdType* out, FloatType* dist,
IdType new_id, FloatType new_dist,
int k, bool check_repeat = false) {
void HeapInsert(
IdType* out, FloatType* dist, IdType new_id, FloatType new_dist, int k,
bool check_repeat = false) {
if (new_dist > dist[0]) return;
// check if we have it
......@@ -99,11 +104,12 @@ void HeapInsert(IdType* out, FloatType* dist,
}
}
/*! \brief Insert a new element and its flag into heap, return 1 if inserted successfully */
/*! \brief Insert a new element and its flag into heap, return 1 if inserted
* successfully */
template <typename FloatType, typename IdType>
int FlaggedHeapInsert(IdType* out, FloatType* dist, bool* flag,
IdType new_id, FloatType new_dist, bool new_flag,
int k, bool check_repeat = false) {
int FlaggedHeapInsert(
IdType* out, FloatType* dist, bool* flag, IdType new_id, FloatType new_dist,
bool new_flag, int k, bool check_repeat = false) {
if (new_dist > dist[0]) return 0;
if (check_repeat) {
......@@ -170,16 +176,15 @@ void BuildHeap(IdType* index, FloatType* dist, int k) {
* distance of these two points, we update the neighborhood of that point.
*/
template <typename FloatType, typename IdType>
int UpdateNeighbors(IdType* neighbors, FloatType* dists, const FloatType* points,
bool* flags, IdType c1, IdType c2, IdType point_start,
int64_t feature_size, int k) {
int UpdateNeighbors(
IdType* neighbors, FloatType* dists, const FloatType* points, bool* flags,
IdType c1, IdType c2, IdType point_start, int64_t feature_size, int k) {
IdType c1_local = c1 - point_start, c2_local = c2 - point_start;
FloatType worst_c1_dist = dists[c1_local * k];
FloatType worst_c2_dist = dists[c2_local * k];
FloatType new_dist = EuclideanDistWithCheck<FloatType, IdType>(
points + c1 * feature_size,
points + c2 * feature_size,
feature_size, std::max(worst_c1_dist, worst_c2_dist));
points + c1 * feature_size, points + c2 * feature_size, feature_size,
std::max(worst_c1_dist, worst_c2_dist));
int num_updates = 0;
if (new_dist < worst_c1_dist) {
......@@ -187,10 +192,8 @@ int UpdateNeighbors(IdType* neighbors, FloatType* dists, const FloatType* points
#pragma omp critical
{
FlaggedHeapInsert<FloatType, IdType>(
neighbors + c1 * k,
dists + c1_local * k,
flags + c1_local * k,
c2, new_dist, true, k, true);
neighbors + c1 * k, dists + c1_local * k, flags + c1_local * k, c2,
new_dist, true, k, true);
}
}
if (new_dist < worst_c2_dist) {
......@@ -198,10 +201,8 @@ int UpdateNeighbors(IdType* neighbors, FloatType* dists, const FloatType* points
#pragma omp critical
{
FlaggedHeapInsert<FloatType, IdType>(
neighbors + c2 * k,
dists + c2_local * k,
flags + c2_local * k,
c1, new_dist, true, k, true);
neighbors + c2 * k, dists + c2_local * k, flags + c2_local * k, c1,
new_dist, true, k, true);
}
}
return num_updates;
......@@ -209,9 +210,10 @@ int UpdateNeighbors(IdType* neighbors, FloatType* dists, const FloatType* points
/*! \brief The kd-tree implementation of K-Nearest Neighbors */
template <typename FloatType, typename IdType>
void KdTreeKNN(const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result) {
void KdTreeKNN(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
const int64_t batch_size = data_offsets->shape[0] - 1;
const int64_t feature_size = data_points->shape[1];
const IdType* data_offsets_data = data_offsets.Ptr<IdType>();
......@@ -228,11 +230,16 @@ void KdTreeKNN(const NDArray& data_points, const IdArray& data_offsets,
auto out_offset = k * q_offset;
// create view for each segment
const NDArray current_data_points = const_cast<NDArray*>(&data_points)->CreateView(
{d_length, feature_size}, data_points->dtype, d_offset * feature_size * sizeof(FloatType));
const FloatType* current_query_pts_data = query_points_data + q_offset * feature_size;
const NDArray current_data_points =
const_cast<NDArray*>(&data_points)
->CreateView(
{d_length, feature_size}, data_points->dtype,
d_offset * feature_size * sizeof(FloatType));
const FloatType* current_query_pts_data =
query_points_data + q_offset * feature_size;
KDTreeNDArrayAdapter<FloatType, IdType> kdtree(feature_size, current_data_points);
KDTreeNDArrayAdapter<FloatType, IdType> kdtree(
feature_size, current_data_points);
// query
parallel_for(0, q_length, [&](IdType b, IdType e) {
......@@ -256,9 +263,10 @@ void KdTreeKNN(const NDArray& data_points, const IdArray& data_offsets,
}
template <typename FloatType, typename IdType>
void BruteForceKNN(const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result) {
void BruteForceKNN(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
const int64_t batch_size = data_offsets->shape[0] - 1;
const int64_t feature_size = data_points->shape[1];
const IdType* data_offsets_data = data_offsets.Ptr<IdType>();
......@@ -285,9 +293,9 @@ void BruteForceKNN(const NDArray& data_points, const IdArray& data_offsets,
for (IdType d_idx = d_start; d_idx < d_end; ++d_idx) {
FloatType tmp_dist = EuclideanDistWithCheck<FloatType, IdType>(
query_points_data + q_idx * feature_size,
data_points_data + d_idx * feature_size,
feature_size, worst_dist);
query_points_data + q_idx * feature_size,
data_points_data + d_idx * feature_size, feature_size,
worst_dist);
if (tmp_dist == std::numeric_limits<FloatType>::max()) {
continue;
......@@ -295,7 +303,7 @@ void BruteForceKNN(const NDArray& data_points, const IdArray& data_offsets,
IdType out_offset = q_idx * k;
HeapInsert<FloatType, IdType>(
data_out + out_offset, dist_buffer.data(), d_idx, tmp_dist, k);
data_out + out_offset, dist_buffer.data(), d_idx, tmp_dist, k);
worst_dist = dist_buffer[0];
}
}
......@@ -305,25 +313,27 @@ void BruteForceKNN(const NDArray& data_points, const IdArray& data_offsets,
} // namespace impl
template <DGLDeviceType XPU, typename FloatType, typename IdType>
void KNN(const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm) {
void KNN(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm) {
if (algorithm == std::string("kd-tree")) {
impl::KdTreeKNN<FloatType, IdType>(
data_points, data_offsets, query_points, query_offsets, k, result);
data_points, data_offsets, query_points, query_offsets, k, result);
} else if (algorithm == std::string("bruteforce")) {
impl::BruteForceKNN<FloatType, IdType>(
data_points, data_offsets, query_points, query_offsets, k, result);
data_points, data_offsets, query_points, query_offsets, k, result);
} else {
LOG(FATAL) << "Algorithm " << algorithm << " is not supported on CPU";
}
}
template <DGLDeviceType XPU, typename FloatType, typename IdType>
void NNDescent(const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta) {
using nnd_updates_t = std::vector<std::vector<std::tuple<IdType, IdType, FloatType>>>;
void NNDescent(
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta) {
using nnd_updates_t =
std::vector<std::vector<std::tuple<IdType, IdType, FloatType>>>;
const auto& ctx = points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t num_nodes = points->shape[0];
......@@ -343,62 +353,69 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
// allocate memory for candidate, sampling pool, distance and flag
IdType* new_candidates = static_cast<IdType*>(
device->AllocWorkspace(ctx, max_segment_size * num_candidates * sizeof(IdType)));
IdType* old_candidates = static_cast<IdType*>(
device->AllocWorkspace(ctx, max_segment_size * num_candidates * sizeof(IdType)));
FloatType* new_candidates_dists = static_cast<FloatType*>(
device->AllocWorkspace(ctx, max_segment_size * num_candidates * sizeof(FloatType)));
FloatType* old_candidates_dists = static_cast<FloatType*>(
device->AllocWorkspace(ctx, max_segment_size * num_candidates * sizeof(FloatType)));
IdType* new_candidates = static_cast<IdType*>(device->AllocWorkspace(
ctx, max_segment_size * num_candidates * sizeof(IdType)));
IdType* old_candidates = static_cast<IdType*>(device->AllocWorkspace(
ctx, max_segment_size * num_candidates * sizeof(IdType)));
FloatType* new_candidates_dists =
static_cast<FloatType*>(device->AllocWorkspace(
ctx, max_segment_size * num_candidates * sizeof(FloatType)));
FloatType* old_candidates_dists =
static_cast<FloatType*>(device->AllocWorkspace(
ctx, max_segment_size * num_candidates * sizeof(FloatType)));
FloatType* neighbors_dists = static_cast<FloatType*>(
device->AllocWorkspace(ctx, max_segment_size * k * sizeof(FloatType)));
device->AllocWorkspace(ctx, max_segment_size * k * sizeof(FloatType)));
bool* flags = static_cast<bool*>(
device->AllocWorkspace(ctx, max_segment_size * k * sizeof(bool)));
device->AllocWorkspace(ctx, max_segment_size * k * sizeof(bool)));
for (IdType b = 0; b < batch_size; ++b) {
IdType point_idx_start = offsets_data[b], point_idx_end = offsets_data[b + 1];
IdType point_idx_start = offsets_data[b],
point_idx_end = offsets_data[b + 1];
IdType segment_size = point_idx_end - point_idx_start;
// random initialization
runtime::parallel_for(point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
dgl::RandomEngine::ThreadLocal()->UniformChoice<IdType>(
k, segment_size, neighbors + i * k, false);
for (IdType n = 0; n < k; ++n) {
central_nodes[i * k + n] = i;
neighbors[i * k + n] += point_idx_start;
flags[local_idx * k + n] = true;
neighbors_dists[local_idx * k + n] = impl::EuclideanDist<FloatType, IdType>(
points_data + i * feature_size,
points_data + neighbors[i * k + n] * feature_size,
feature_size);
}
impl::BuildHeap<FloatType, IdType>(neighbors + i * k, neighbors_dists + local_idx * k, k);
}
});
runtime::parallel_for(
point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
dgl::RandomEngine::ThreadLocal()->UniformChoice<IdType>(
k, segment_size, neighbors + i * k, false);
for (IdType n = 0; n < k; ++n) {
central_nodes[i * k + n] = i;
neighbors[i * k + n] += point_idx_start;
flags[local_idx * k + n] = true;
neighbors_dists[local_idx * k + n] =
impl::EuclideanDist<FloatType, IdType>(
points_data + i * feature_size,
points_data + neighbors[i * k + n] * feature_size,
feature_size);
}
impl::BuildHeap<FloatType, IdType>(
neighbors + i * k, neighbors_dists + local_idx * k, k);
}
});
size_t num_updates = 0;
for (int iter = 0; iter < num_iters; ++iter) {
num_updates = 0;
// initialize candidates array as empty value
runtime::parallel_for(point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType c = 0; c < num_candidates; ++c) {
new_candidates[local_idx * num_candidates + c] = num_nodes;
old_candidates[local_idx * num_candidates + c] = num_nodes;
new_candidates_dists[local_idx * num_candidates + c] =
std::numeric_limits<FloatType>::max();
old_candidates_dists[local_idx * num_candidates + c] =
std::numeric_limits<FloatType>::max();
}
}
});
runtime::parallel_for(
point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType c = 0; c < num_candidates; ++c) {
new_candidates[local_idx * num_candidates + c] = num_nodes;
old_candidates[local_idx * num_candidates + c] = num_nodes;
new_candidates_dists[local_idx * num_candidates + c] =
std::numeric_limits<FloatType>::max();
old_candidates_dists[local_idx * num_candidates + c] =
std::numeric_limits<FloatType>::max();
}
}
});
// randomly select neighbors as candidates
int num_threads = omp_get_max_threads();
......@@ -410,33 +427,36 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
IdType neighbor_idx = neighbors[i * k + n];
bool is_new = flags[local_idx * k + n];
IdType local_neighbor_idx = neighbor_idx - point_idx_start;
FloatType random_dist = dgl::RandomEngine::ThreadLocal()->Uniform<FloatType>();
FloatType random_dist =
dgl::RandomEngine::ThreadLocal()->Uniform<FloatType>();
if (is_new) {
if (local_idx % num_threads == tid) {
impl::HeapInsert<FloatType, IdType>(
new_candidates + local_idx * num_candidates,
new_candidates_dists + local_idx * num_candidates,
neighbor_idx, random_dist, num_candidates, true);
new_candidates + local_idx * num_candidates,
new_candidates_dists + local_idx * num_candidates,
neighbor_idx, random_dist, num_candidates, true);
}
if (local_neighbor_idx % num_threads == tid) {
impl::HeapInsert<FloatType, IdType>(
new_candidates + local_neighbor_idx * num_candidates,
new_candidates_dists + local_neighbor_idx * num_candidates,
i, random_dist, num_candidates, true);
new_candidates + local_neighbor_idx * num_candidates,
new_candidates_dists +
local_neighbor_idx * num_candidates,
i, random_dist, num_candidates, true);
}
} else {
if (local_idx % num_threads == tid) {
impl::HeapInsert<FloatType, IdType>(
old_candidates + local_idx * num_candidates,
old_candidates_dists + local_idx * num_candidates,
neighbor_idx, random_dist, num_candidates, true);
old_candidates + local_idx * num_candidates,
old_candidates_dists + local_idx * num_candidates,
neighbor_idx, random_dist, num_candidates, true);
}
if (local_neighbor_idx % num_threads == tid) {
impl::HeapInsert<FloatType, IdType>(
old_candidates + local_neighbor_idx * num_candidates,
old_candidates_dists + local_neighbor_idx * num_candidates,
i, random_dist, num_candidates, true);
old_candidates + local_neighbor_idx * num_candidates,
old_candidates_dists +
local_neighbor_idx * num_candidates,
i, random_dist, num_candidates, true);
}
}
}
......@@ -445,27 +465,28 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
});
// mark all elements in new_candidates as false
runtime::parallel_for(point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType n = 0; n < k; ++n) {
IdType n_idx = neighbors[i * k + n];
for (IdType c = 0; c < num_candidates; ++c) {
if (new_candidates[local_idx * num_candidates + c] == n_idx) {
flags[local_idx * k + n] = false;
break;
runtime::parallel_for(
point_idx_start, point_idx_end, [&](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
IdType local_idx = i - point_idx_start;
for (IdType n = 0; n < k; ++n) {
IdType n_idx = neighbors[i * k + n];
for (IdType c = 0; c < num_candidates; ++c) {
if (new_candidates[local_idx * num_candidates + c] == n_idx) {
flags[local_idx * k + n] = false;
break;
}
}
}
}
}
}
});
});
// update neighbors block by block
for (IdType block_start = point_idx_start;
block_start < point_idx_end;
for (IdType block_start = point_idx_start; block_start < point_idx_end;
block_start += impl::NN_DESCENT_BLOCK_SIZE) {
IdType block_end = std::min(point_idx_end, block_start + impl::NN_DESCENT_BLOCK_SIZE);
IdType block_end =
std::min(point_idx_end, block_start + impl::NN_DESCENT_BLOCK_SIZE);
IdType block_size = block_end - block_start;
nnd_updates_t updates(block_size);
......@@ -487,14 +508,15 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
FloatType worst_c1_dist = neighbors_dists[c1_local * k];
FloatType worst_c2_dist = neighbors_dists[c2_local * k];
FloatType new_dist = impl::EuclideanDistWithCheck<FloatType, IdType>(
points_data + new_c1 * feature_size,
points_data + new_c2 * feature_size,
feature_size,
std::max(worst_c1_dist, worst_c2_dist));
FloatType new_dist =
impl::EuclideanDistWithCheck<FloatType, IdType>(
points_data + new_c1 * feature_size,
points_data + new_c2 * feature_size, feature_size,
std::max(worst_c1_dist, worst_c2_dist));
if (new_dist < worst_c1_dist || new_dist < worst_c2_dist) {
updates[i - block_start].push_back(std::make_tuple(new_c1, new_c2, new_dist));
updates[i - block_start].push_back(
std::make_tuple(new_c1, new_c2, new_dist));
}
}
......@@ -506,14 +528,15 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
FloatType worst_c1_dist = neighbors_dists[c1_local * k];
FloatType worst_c2_dist = neighbors_dists[c2_local * k];
FloatType new_dist = impl::EuclideanDistWithCheck<FloatType, IdType>(
points_data + new_c1 * feature_size,
points_data + old_c2 * feature_size,
feature_size,
std::max(worst_c1_dist, worst_c2_dist));
FloatType new_dist =
impl::EuclideanDistWithCheck<FloatType, IdType>(
points_data + new_c1 * feature_size,
points_data + old_c2 * feature_size, feature_size,
std::max(worst_c1_dist, worst_c2_dist));
if (new_dist < worst_c1_dist || new_dist < worst_c2_dist) {
updates[i - block_start].push_back(std::make_tuple(new_c1, old_c2, new_dist));
updates[i - block_start].push_back(
std::make_tuple(new_c1, old_c2, new_dist));
}
}
}
......@@ -521,12 +544,12 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
});
int tid;
#pragma omp parallel private(tid, num_threads) reduction(+:num_updates)
#pragma omp parallel private(tid, num_threads) reduction(+ : num_updates)
{
tid = omp_get_thread_num();
num_threads = omp_get_num_threads();
for (IdType i = 0; i < block_size; ++i) {
for (const auto & u : updates[i]) {
for (const auto& u : updates[i]) {
IdType p1, p2;
FloatType d;
std::tie(p1, p2, d) = u;
......@@ -535,17 +558,13 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
if (p1 % num_threads == tid) {
num_updates += impl::FlaggedHeapInsert<FloatType, IdType>(
neighbors + p1 * k,
neighbors_dists + p1_local * k,
flags + p1_local * k,
p2, d, true, k, true);
neighbors + p1 * k, neighbors_dists + p1_local * k,
flags + p1_local * k, p2, d, true, k, true);
}
if (p2 % num_threads == tid) {
num_updates += impl::FlaggedHeapInsert<FloatType, IdType>(
neighbors + p2 * k,
neighbors_dists + p2_local * k,
flags + p2_local * k,
p1, d, true, k, true);
neighbors + p2 * k, neighbors_dists + p2_local * k,
flags + p2_local * k, p1, d, true, k, true);
}
}
}
......@@ -568,37 +587,33 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
template void KNN<kDGLCPU, float, int32_t>(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm);
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm);
template void KNN<kDGLCPU, float, int64_t>(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm);
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm);
template void KNN<kDGLCPU, double, int32_t>(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm);
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm);
template void KNN<kDGLCPU, double, int64_t>(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm);
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm);
template void NNDescent<kDGLCPU, float, int32_t>(
const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta);
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta);
template void NNDescent<kDGLCPU, float, int64_t>(
const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta);
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta);
template void NNDescent<kDGLCPU, double, int32_t>(
const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta);
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta);
template void NNDescent<kDGLCPU, double, int64_t>(
const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta);
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta);
} // namespace transform
} // namespace dgl
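
The explicit instantiations above fix the supported (device, float, id) combinations; a call sketch matching them, assuming segmented CPU inputs and a preallocated int64 `result` IdArray in whatever layout KNN expects (its exact shape is not spelled out in this diff):

// Sketch only: all tensors are assumed to exist with matching dtypes.
const int k = 8;
dgl::transform::KNN<kDGLCPU, float, int64_t>(
    data_points, data_offsets, query_points, query_offsets, k, result,
    "kd-tree");  // or "bruteforce"

// Self-KNN via NN-descent on a single point set; the iteration/candidate
// parameters below are arbitrary illustration values.
dgl::transform::NNDescent<kDGLCPU, float, int64_t>(
    data_points, data_offsets, result, k, /*num_iters=*/10,
    /*num_candidates=*/16, /*delta=*/0.001);
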
......@@ -18,13 +18,13 @@
* all given graphs with the same set of nodes.
*/
#include <dgl/runtime/device_api.h>
#include <dgl/immutable_graph.h>
#include <cuda_runtime.h>
#include <utility>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h>
#include <algorithm>
#include <memory>
#include <utility>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../heterograph.h"
......@@ -41,54 +41,45 @@ namespace transform {
namespace {
/**
* \brief This function builds node maps for each node type, preserving the
* order of the input nodes. Here it is assumed the nodes are not unique,
* and thus a unique list is generated.
*
* \param input_nodes The set of input nodes.
* \param node_maps The node maps to be constructed.
* \param count_unique_device The number of unique nodes (on the GPU).
* \param unique_nodes_device The unique nodes (on the GPU).
* \param stream The stream to operate on.
*/
template<typename IdType>
* \brief This function builds node maps for each node type, preserving the
* order of the input nodes. Here it is assumed the nodes are not unique,
* and thus a unique list is generated.
*
* \param input_nodes The set of input nodes.
* \param node_maps The node maps to be constructed.
* \param count_unique_device The number of unique nodes (on the GPU).
* \param unique_nodes_device The unique nodes (on the GPU).
* \param stream The stream to operate on.
*/
template <typename IdType>
void BuildNodeMaps(
const std::vector<IdArray>& input_nodes,
DeviceNodeMap<IdType> * const node_maps,
int64_t * const count_unique_device,
std::vector<IdArray>* const unique_nodes_device,
cudaStream_t stream) {
const std::vector<IdArray> &input_nodes,
DeviceNodeMap<IdType> *const node_maps, int64_t *const count_unique_device,
std::vector<IdArray> *const unique_nodes_device, cudaStream_t stream) {
const int64_t num_ntypes = static_cast<int64_t>(input_nodes.size());
CUDA_CALL(cudaMemsetAsync(
count_unique_device,
0,
num_ntypes*sizeof(*count_unique_device),
stream));
count_unique_device, 0, num_ntypes * sizeof(*count_unique_device),
stream));
// possibly duplicated nodes
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
const IdArray& nodes = input_nodes[ntype];
const IdArray &nodes = input_nodes[ntype];
if (nodes->shape[0] > 0) {
CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
node_maps->LhsHashTable(ntype).FillWithDuplicates(
nodes.Ptr<IdType>(),
nodes->shape[0],
nodes.Ptr<IdType>(), nodes->shape[0],
(*unique_nodes_device)[ntype].Ptr<IdType>(),
count_unique_device+ntype,
stream);
count_unique_device + ntype, stream);
}
}
}
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphsGPU(
template <typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphsGPU(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
const auto& ctx = graphs[0]->Context();
const auto &ctx = graphs[0]->Context();
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
......@@ -96,7 +87,8 @@ CompactGraphsGPU(
// Step 1: Collect the nodes that have connections for each type.
const uint64_t num_ntypes = graphs[0]->NumVertexTypes();
std::vector<std::vector<EdgeArray>> all_edges(graphs.size()); // all_edges[i][etype]
std::vector<std::vector<EdgeArray>> all_edges(
graphs.size()); // all_edges[i][etype]
// count the number of nodes per type
std::vector<int64_t> max_vertex_cnt(num_ntypes, 0);
......@@ -123,19 +115,18 @@ CompactGraphsGPU(
std::vector<int64_t> node_offsets(num_ntypes, 0);
for (uint64_t ntype = 0; ntype < num_ntypes; ++ntype) {
all_nodes[ntype] = NewIdArray(max_vertex_cnt[ntype], ctx,
sizeof(IdType)*8);
all_nodes[ntype] =
NewIdArray(max_vertex_cnt[ntype], ctx, sizeof(IdType) * 8);
// copy the nodes in always_preserve
if (ntype < always_preserve.size() && always_preserve[ntype]->shape[0] > 0) {
if (ntype < always_preserve.size() &&
always_preserve[ntype]->shape[0] > 0) {
device->CopyDataFromTo(
always_preserve[ntype].Ptr<IdType>(), 0,
all_nodes[ntype].Ptr<IdType>(),
node_offsets[ntype],
sizeof(IdType)*always_preserve[ntype]->shape[0],
always_preserve[ntype]->ctx,
all_nodes[ntype]->ctx,
all_nodes[ntype].Ptr<IdType>(), node_offsets[ntype],
sizeof(IdType) * always_preserve[ntype]->shape[0],
always_preserve[ntype]->ctx, all_nodes[ntype]->ctx,
always_preserve[ntype]->dtype);
node_offsets[ntype] += sizeof(IdType)*always_preserve[ntype]->shape[0];
node_offsets[ntype] += sizeof(IdType) * always_preserve[ntype]->shape[0];
}
}
......@@ -152,25 +143,17 @@ CompactGraphsGPU(
if (edges.src.defined()) {
device->CopyDataFromTo(
edges.src.Ptr<IdType>(), 0,
all_nodes[srctype].Ptr<IdType>(),
node_offsets[srctype],
sizeof(IdType)*edges.src->shape[0],
edges.src->ctx,
all_nodes[srctype]->ctx,
edges.src->dtype);
node_offsets[srctype] += sizeof(IdType)*edges.src->shape[0];
edges.src.Ptr<IdType>(), 0, all_nodes[srctype].Ptr<IdType>(),
node_offsets[srctype], sizeof(IdType) * edges.src->shape[0],
edges.src->ctx, all_nodes[srctype]->ctx, edges.src->dtype);
node_offsets[srctype] += sizeof(IdType) * edges.src->shape[0];
}
if (edges.dst.defined()) {
device->CopyDataFromTo(
edges.dst.Ptr<IdType>(), 0,
all_nodes[dsttype].Ptr<IdType>(),
node_offsets[dsttype],
sizeof(IdType)*edges.dst->shape[0],
edges.dst->ctx,
all_nodes[dsttype]->ctx,
edges.dst->dtype);
node_offsets[dsttype] += sizeof(IdType)*edges.dst->shape[0];
edges.dst.Ptr<IdType>(), 0, all_nodes[dsttype].Ptr<IdType>(),
node_offsets[dsttype], sizeof(IdType) * edges.dst->shape[0],
edges.dst->ctx, all_nodes[dsttype]->ctx, edges.dst->dtype);
node_offsets[dsttype] += sizeof(IdType) * edges.dst->shape[0];
}
all_edges[i].push_back(edges);
}
......@@ -185,29 +168,22 @@ CompactGraphsGPU(
// number of unique nodes per type on CPU
std::vector<int64_t> num_induced_nodes(num_ntypes);
// number of unique nodes per type on GPU
int64_t * count_unique_device = static_cast<int64_t*>(
device->AllocWorkspace(ctx, sizeof(int64_t)*num_ntypes));
int64_t *count_unique_device = static_cast<int64_t *>(
device->AllocWorkspace(ctx, sizeof(int64_t) * num_ntypes));
// the set of unique nodes per type
std::vector<IdArray> induced_nodes(num_ntypes);
for (uint64_t ntype = 0; ntype < num_ntypes; ++ntype) {
induced_nodes[ntype] = NewIdArray(max_vertex_cnt[ntype], ctx,
sizeof(IdType)*8);
induced_nodes[ntype] =
NewIdArray(max_vertex_cnt[ntype], ctx, sizeof(IdType) * 8);
}
BuildNodeMaps(
all_nodes,
&node_maps,
count_unique_device,
&induced_nodes,
stream);
all_nodes, &node_maps, count_unique_device, &induced_nodes, stream);
device->CopyDataFromTo(
count_unique_device, 0,
num_induced_nodes.data(), 0,
sizeof(*num_induced_nodes.data())*num_ntypes,
ctx,
DGLContext{kDGLCPU, 0},
DGLDataType{kDGLInt, 64, 1});
count_unique_device, 0, num_induced_nodes.data(), 0,
sizeof(*num_induced_nodes.data()) * num_ntypes, ctx,
DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1});
device->StreamSync(ctx, stream);
// wait for the node counts to finish transferring
......@@ -230,22 +206,20 @@ CompactGraphsGPU(
std::vector<IdArray> new_src;
std::vector<IdArray> new_dst;
std::tie(new_src, new_dst) = MapEdges(
curr_graph, all_edges[i], node_maps, stream);
std::tie(new_src, new_dst) =
MapEdges(curr_graph, all_edges[i], node_maps, stream);
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
rel_graphs.push_back(UnitGraph::CreateFromCOO(
srctype == dsttype ? 1 : 2,
induced_nodes[srctype]->shape[0],
induced_nodes[dsttype]->shape[0],
new_src[etype],
new_dst[etype]));
srctype == dsttype ? 1 : 2, induced_nodes[srctype]->shape[0],
induced_nodes[dsttype]->shape[0], new_src[etype], new_dst[etype]));
}
new_graphs.push_back(CreateHeteroGraph(meta_graph, rel_graphs, num_induced_nodes));
new_graphs.push_back(
CreateHeteroGraph(meta_graph, rel_graphs, num_induced_nodes));
}
return std::make_pair(new_graphs, induced_nodes);
......@@ -253,7 +227,7 @@ CompactGraphsGPU(
} // namespace
template<>
template <>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDGLCUDA, int32_t>(
const std::vector<HeteroGraphPtr> &graphs,
......@@ -261,7 +235,7 @@ CompactGraphs<kDGLCUDA, int32_t>(
return CompactGraphsGPU<int32_t>(graphs, always_preserve);
}
template<>
template <>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs<kDGLCUDA, int64_t>(
const std::vector<HeteroGraphPtr> &graphs,
......
......@@ -20,13 +20,14 @@
#ifndef DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_
#define DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_
#include <dgl/runtime/c_runtime_api.h>
#include <cuda_runtime.h>
#include <dgl/runtime/c_runtime_api.h>
#include <algorithm>
#include <memory>
#include <tuple>
#include <vector>
#include <utility>
#include <vector>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../../runtime/cuda/cuda_hashtable.cuh"
......@@ -39,48 +40,46 @@ namespace transform {
namespace cuda {
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
template <typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__device__ void map_vertex_ids(
const IdType * const global,
IdType * const new_global,
const IdType num_vertices,
const DeviceOrderedHashTable<IdType>& table) {
const IdType* const global, IdType* const new_global,
const IdType num_vertices, const DeviceOrderedHashTable<IdType>& table) {
assert(BLOCK_SIZE == blockDim.x);
using Mapping = typename OrderedHashTable<IdType>::Mapping;
const IdType tile_start = TILE_SIZE*blockIdx.x;
const IdType tile_end = min(TILE_SIZE*(blockIdx.x+1), num_vertices);
const IdType tile_start = TILE_SIZE * blockIdx.x;
const IdType tile_end = min(TILE_SIZE * (blockIdx.x + 1), num_vertices);
for (IdType idx = threadIdx.x+tile_start; idx < tile_end; idx+=BLOCK_SIZE) {
for (IdType idx = threadIdx.x + tile_start; idx < tile_end;
idx += BLOCK_SIZE) {
const Mapping& mapping = *table.Search(global[idx]);
new_global[idx] = mapping.local;
}
}
/**
* \brief Generate mapped edge endpoint ids.
*
* \tparam IdType The type of id.
* \tparam BLOCK_SIZE The size of each thread block.
* \tparam TILE_SIZE The number of edges to process per thread block.
* \param global_srcs_device The source ids to map.
* \param new_global_srcs_device The mapped source ids (output).
* \param global_dsts_device The destination ids to map.
* \param new_global_dsts_device The mapped destination ids (output).
* \param num_edges The number of edges to map.
* \param src_mapping The mapping of source ids.
* \param src_hash_size The size of the source id hash table/mapping.
* \param dst_mapping The mapping of destination ids.
* \param dst_hash_size The size of the destination id hash table/mapping.
*/
template<typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
* \brief Generate mapped edge endpoint ids.
*
* \tparam IdType The type of id.
* \tparam BLOCK_SIZE The size of each thread block.
* \tparam TILE_SIZE The number of edges to process per thread block.
* \param global_srcs_device The source ids to map.
* \param new_global_srcs_device The mapped source ids (output).
* \param global_dsts_device The destination ids to map.
* \param new_global_dsts_device The mapped destination ids (output).
* \param num_edges The number of edges to map.
* \param src_mapping The mapping of source ids.
* \param src_hash_size The size of the source id hash table/mapping.
* \param dst_mapping The mapping of destination ids.
* \param dst_hash_size The size of the destination id hash table/mapping.
*/
template <typename IdType, int BLOCK_SIZE, IdType TILE_SIZE>
__global__ void map_edge_ids(
const IdType * const global_srcs_device,
IdType * const new_global_srcs_device,
const IdType * const global_dsts_device,
IdType * const new_global_dsts_device,
const IdType num_edges,
const IdType* const global_srcs_device,
IdType* const new_global_srcs_device,
const IdType* const global_dsts_device,
IdType* const new_global_dsts_device, const IdType num_edges,
DeviceOrderedHashTable<IdType> src_mapping,
DeviceOrderedHashTable<IdType> dst_mapping) {
assert(BLOCK_SIZE == blockDim.x);
......@@ -88,87 +87,67 @@ __global__ void map_edge_ids(
if (blockIdx.y == 0) {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_srcs_device,
new_global_srcs_device,
num_edges,
src_mapping);
global_srcs_device, new_global_srcs_device, num_edges, src_mapping);
} else {
map_vertex_ids<IdType, BLOCK_SIZE, TILE_SIZE>(
global_dsts_device,
new_global_dsts_device,
num_edges,
dst_mapping);
global_dsts_device, new_global_dsts_device, num_edges, dst_mapping);
}
}
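For reference, MapEdges below instantiates this kernel with BLOCK_SIZE = 128 and
TILE_SIZE = 1024, so each block remaps a tile of 1024 edge endpoints and every
thread strides over 1024 / 128 = 8 of them. A minimal host-side launch sketch
under those assumptions (the variable names are illustrative, and the actual
call goes through CUDA_KERNEL_CALL):

  // One block row (blockIdx.y == 0) maps sources, the other maps destinations.
  const dim3 grid(RoundUpDiv(num_edges, TILE_SIZE), 2);
  const dim3 block(BLOCK_SIZE);
  map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, stream>>>(
      src_ids, new_src_ids, dst_ids, new_dst_ids, num_edges,
      src_map.DeviceHandle(), dst_map.DeviceHandle());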
/**
* \brief Device level node maps for each node type.
*
* \param num_nodes Number of nodes per type.
 * \param offset When offset is set to 0, LhsHashTable is identical to RhsHashTable;
 * set it to num_nodes.size()/2 to use a separate LhsHashTable and RhsHashTable.
* \param ctx The DGL context.
* \param stream The stream to operate on.
*/
template<typename IdType>
* \brief Device level node maps for each node type.
*
* \param num_nodes Number of nodes per type.
 * \param offset When offset is set to 0, LhsHashTable is identical to
 * RhsHashTable; set it to num_nodes.size()/2 to use a separate
 * LhsHashTable and RhsHashTable.
* \param ctx The DGL context.
* \param stream The stream to operate on.
*/
template <typename IdType>
class DeviceNodeMap {
public:
using Mapping = typename OrderedHashTable<IdType>::Mapping;
DeviceNodeMap(
const std::vector<int64_t>& num_nodes,
const int64_t offset,
DGLContext ctx,
cudaStream_t stream) :
num_types_(num_nodes.size()),
rhs_offset_(offset),
hash_tables_(),
ctx_(ctx) {
const std::vector<int64_t>& num_nodes, const int64_t offset,
DGLContext ctx, cudaStream_t stream)
: num_types_(num_nodes.size()),
rhs_offset_(offset),
hash_tables_(),
ctx_(ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
hash_tables_.reserve(num_types_);
for (int64_t i = 0; i < num_types_; ++i) {
hash_tables_.emplace_back(
new OrderedHashTable<IdType>(
num_nodes[i],
ctx_,
stream));
new OrderedHashTable<IdType>(num_nodes[i], ctx_, stream));
}
}
OrderedHashTable<IdType>& LhsHashTable(
const size_t index) {
OrderedHashTable<IdType>& LhsHashTable(const size_t index) {
return HashData(index);
}
OrderedHashTable<IdType>& RhsHashTable(
const size_t index) {
return HashData(index+rhs_offset_);
OrderedHashTable<IdType>& RhsHashTable(const size_t index) {
return HashData(index + rhs_offset_);
}
const OrderedHashTable<IdType>& LhsHashTable(
const size_t index) const {
const OrderedHashTable<IdType>& LhsHashTable(const size_t index) const {
return HashData(index);
}
const OrderedHashTable<IdType>& RhsHashTable(
const size_t index) const {
return HashData(index+rhs_offset_);
const OrderedHashTable<IdType>& RhsHashTable(const size_t index) const {
return HashData(index + rhs_offset_);
}
IdType LhsHashSize(
const size_t index) const {
return HashSize(index);
}
IdType LhsHashSize(const size_t index) const { return HashSize(index); }
IdType RhsHashSize(
const size_t index) const {
return HashSize(rhs_offset_+index);
IdType RhsHashSize(const size_t index) const {
return HashSize(rhs_offset_ + index);
}
size_t Size() const {
return hash_tables_.size();
}
size_t Size() const { return hash_tables_.size(); }
private:
int64_t num_types_;
......@@ -176,45 +155,35 @@ class DeviceNodeMap {
std::vector<std::unique_ptr<OrderedHashTable<IdType>>> hash_tables_;
DGLContext ctx_;
inline OrderedHashTable<IdType>& HashData(
const size_t index) {
inline OrderedHashTable<IdType>& HashData(const size_t index) {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline const OrderedHashTable<IdType>& HashData(
const size_t index) const {
inline const OrderedHashTable<IdType>& HashData(const size_t index) const {
CHECK_LT(index, hash_tables_.size());
return *hash_tables_[index];
}
inline IdType HashSize(
const size_t index) const {
inline IdType HashSize(const size_t index) const {
return HashData(index).size();
}
};
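In ToBlockGPU further down in this diff, maxNodesPerType stores the source
counts in its first half and the destination counts in its second half, which
corresponds to the separated mode (offset equal to the number of node types).
A small sketch of both modes, with illustrative counts and names:

  std::vector<int64_t> num_nodes = {n_src_t0, n_src_t1,   // lhs (source) tables
                                    n_dst_t0, n_dst_t1};  // rhs (destination) tables
  // Separate tables: LhsHashTable(i) uses entry i, RhsHashTable(i) entry i + 2.
  DeviceNodeMap<int64_t> separated(num_nodes, num_nodes.size() / 2, ctx, stream);
  // Shared tables: offset 0 makes LhsHashTable(i) and RhsHashTable(i) the same.
  DeviceNodeMap<int64_t> shared(num_nodes, 0, ctx, stream);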
template<typename IdType>
inline size_t RoundUpDiv(
const IdType num,
const size_t divisor) {
return static_cast<IdType>(num/divisor) + (num % divisor == 0 ? 0 : 1);
template <typename IdType>
inline size_t RoundUpDiv(const IdType num, const size_t divisor) {
return static_cast<IdType>(num / divisor) + (num % divisor == 0 ? 0 : 1);
}
template<typename IdType>
inline IdType RoundUp(
const IdType num,
const size_t unit) {
return RoundUpDiv(num, unit)*unit;
template <typename IdType>
inline IdType RoundUp(const IdType num, const size_t unit) {
return RoundUpDiv(num, unit) * unit;
}
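A quick numeric illustration of the two helpers above (the asserts only state
the expected values):

  assert(RoundUpDiv(10, 4) == 3 && RoundUp(10, 4) == 12);
  assert(RoundUpDiv(8, 4) == 2 && RoundUp(8, 4) == 8);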
template<typename IdType>
std::tuple<std::vector<IdArray>, std::vector<IdArray>>
MapEdges(
HeteroGraphPtr graph,
const std::vector<EdgeArray>& edge_sets,
const DeviceNodeMap<IdType>& node_map,
cudaStream_t stream) {
template <typename IdType>
std::tuple<std::vector<IdArray>, std::vector<IdArray>> MapEdges(
HeteroGraphPtr graph, const std::vector<EdgeArray>& edge_sets,
const DeviceNodeMap<IdType>& node_map, cudaStream_t stream) {
constexpr const int BLOCK_SIZE = 128;
constexpr const size_t TILE_SIZE = 1024;
......@@ -233,8 +202,8 @@ MapEdges(
if (edges.id.defined() && edges.src->shape[0] > 0) {
const int64_t num_edges = edges.src->shape[0];
new_lhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
new_rhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType)*8));
new_lhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType) * 8));
new_rhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType) * 8));
const auto src_dst_types = graph->GetEndpointTypes(etype);
const int src_type = src_dst_types.first;
......@@ -244,20 +213,17 @@ MapEdges(
const dim3 block(BLOCK_SIZE);
// map the srcs
CUDA_KERNEL_CALL((map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE>),
grid, block, 0, stream,
edges.src.Ptr<IdType>(),
new_lhs.back().Ptr<IdType>(),
edges.dst.Ptr<IdType>(),
new_rhs.back().Ptr<IdType>(),
num_edges,
node_map.LhsHashTable(src_type).DeviceHandle(),
node_map.RhsHashTable(dst_type).DeviceHandle());
CUDA_KERNEL_CALL(
(map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE>), grid, block, 0, stream,
edges.src.Ptr<IdType>(), new_lhs.back().Ptr<IdType>(),
edges.dst.Ptr<IdType>(), new_rhs.back().Ptr<IdType>(), num_edges,
node_map.LhsHashTable(src_type).DeviceHandle(),
node_map.RhsHashTable(dst_type).DeviceHandle());
} else {
new_lhs.emplace_back(
aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType)*8, 1}, ctx));
aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx));
new_rhs.emplace_back(
aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType)*8, 1}, ctx));
aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx));
}
}
......@@ -265,7 +231,6 @@ MapEdges(
std::move(new_lhs), std::move(new_rhs));
}
} // namespace cuda
} // namespace transform
} // namespace dgl
......
......@@ -18,13 +18,13 @@
* ids.
*/
#include <dgl/runtime/device_api.h>
#include <dgl/immutable_graph.h>
#include <cuda_runtime.h>
#include <utility>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/device_api.h>
#include <algorithm>
#include <memory>
#include <utility>
#include "../../../runtime/cuda/cuda_common.h"
#include "../../heterograph.h"
......@@ -40,42 +40,36 @@ namespace transform {
namespace {
template<typename IdType>
template <typename IdType>
class DeviceNodeMapMaker {
public:
explicit DeviceNodeMapMaker(
const std::vector<int64_t>& maxNodesPerType) :
max_num_nodes_(0) {
max_num_nodes_ = *std::max_element(maxNodesPerType.begin(),
maxNodesPerType.end());
explicit DeviceNodeMapMaker(const std::vector<int64_t>& maxNodesPerType)
: max_num_nodes_(0) {
max_num_nodes_ =
*std::max_element(maxNodesPerType.begin(), maxNodesPerType.end());
}
/**
* \brief This function builds node maps for each node type, preserving the
* order of the input nodes. Here it is assumed the lhs_nodes are not unique,
* and thus a unique list is generated.
*
* \param lhs_nodes The set of source input nodes.
* \param rhs_nodes The set of destination input nodes.
* \param node_maps The node maps to be constructed.
* \param count_lhs_device The number of unique source nodes (on the GPU).
* \param lhs_device The unique source nodes (on the GPU).
* \param stream The stream to operate on.
*/
* \brief This function builds node maps for each node type, preserving the
* order of the input nodes. Here it is assumed the lhs_nodes are not unique,
* and thus a unique list is generated.
*
* \param lhs_nodes The set of source input nodes.
* \param rhs_nodes The set of destination input nodes.
* \param node_maps The node maps to be constructed.
* \param count_lhs_device The number of unique source nodes (on the GPU).
* \param lhs_device The unique source nodes (on the GPU).
* \param stream The stream to operate on.
*/
void Make(
const std::vector<IdArray>& lhs_nodes,
const std::vector<IdArray>& rhs_nodes,
DeviceNodeMap<IdType> * const node_maps,
int64_t * const count_lhs_device,
std::vector<IdArray>* const lhs_device,
cudaStream_t stream) {
DeviceNodeMap<IdType>* const node_maps, int64_t* const count_lhs_device,
std::vector<IdArray>* const lhs_device, cudaStream_t stream) {
const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
CUDA_CALL(cudaMemsetAsync(
count_lhs_device,
0,
num_ntypes*sizeof(*count_lhs_device),
stream));
count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream));
    // possibly duplicate lhs nodes
const int64_t lhs_num_ntypes = static_cast<int64_t>(lhs_nodes.size());
......@@ -84,10 +78,8 @@ class DeviceNodeMapMaker {
if (nodes->shape[0] > 0) {
CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
node_maps->LhsHashTable(ntype).FillWithDuplicates(
nodes.Ptr<IdType>(),
nodes->shape[0],
(*lhs_device)[ntype].Ptr<IdType>(),
count_lhs_device+ntype,
nodes.Ptr<IdType>(), nodes->shape[0],
(*lhs_device)[ntype].Ptr<IdType>(), count_lhs_device + ntype,
stream);
}
}
......@@ -98,28 +90,25 @@ class DeviceNodeMapMaker {
const IdArray& nodes = rhs_nodes[ntype];
if (nodes->shape[0] > 0) {
node_maps->RhsHashTable(ntype).FillWithUnique(
nodes.Ptr<IdType>(),
nodes->shape[0],
stream);
nodes.Ptr<IdType>(), nodes->shape[0], stream);
}
}
}
/**
* \brief This function builds node maps for each node type, preserving the
* order of the input nodes. Here it is assumed both lhs_nodes and rhs_nodes
* are unique.
*
* \param lhs_nodes The set of source input nodes.
* \param rhs_nodes The set of destination input nodes.
* \param node_maps The node maps to be constructed.
* \param stream The stream to operate on.
*/
* \brief This function builds node maps for each node type, preserving the
* order of the input nodes. Here it is assumed both lhs_nodes and rhs_nodes
* are unique.
*
* \param lhs_nodes The set of source input nodes.
* \param rhs_nodes The set of destination input nodes.
* \param node_maps The node maps to be constructed.
* \param stream The stream to operate on.
*/
void Make(
const std::vector<IdArray>& lhs_nodes,
const std::vector<IdArray>& rhs_nodes,
DeviceNodeMap<IdType> * const node_maps,
cudaStream_t stream) {
DeviceNodeMap<IdType>* const node_maps, cudaStream_t stream) {
const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size();
// unique lhs nodes
......@@ -129,9 +118,7 @@ class DeviceNodeMapMaker {
if (nodes->shape[0] > 0) {
CHECK_EQ(nodes->ctx.device_type, kDGLCUDA);
node_maps->LhsHashTable(ntype).FillWithUnique(
nodes.Ptr<IdType>(),
nodes->shape[0],
stream);
nodes.Ptr<IdType>(), nodes->shape[0], stream);
}
}
......@@ -141,9 +128,7 @@ class DeviceNodeMapMaker {
const IdArray& nodes = rhs_nodes[ntype];
if (nodes->shape[0] > 0) {
node_maps->RhsHashTable(ntype).FillWithUnique(
nodes.Ptr<IdType>(),
nodes->shape[0],
stream);
nodes.Ptr<IdType>(), nodes->shape[0], stream);
}
}
}
......@@ -152,20 +137,15 @@ class DeviceNodeMapMaker {
IdType max_num_nodes_;
};
// Since partial specialization is not allowed for functions, use this as an
// intermediate for ToBlock where XPU = kDGLCUDA.
template<typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
ToBlockGPU(
HeteroGraphPtr graph,
const std::vector<IdArray> &rhs_nodes,
const bool include_rhs_in_lhs,
std::vector<IdArray>* const lhs_nodes_ptr) {
template <typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlockGPU(
HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
const bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes_ptr) {
std::vector<IdArray>& lhs_nodes = *lhs_nodes_ptr;
const bool generate_lhs_nodes = lhs_nodes.empty();
const auto& ctx = graph->Context();
auto device = runtime::DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
......@@ -176,16 +156,17 @@ ToBlockGPU(
}
// Since DST nodes are included in SRC nodes, a common requirement is to fetch
  // the DST node features from the SRC node features. To avoid an expensive sparse lookup,
  // the function ensures that the DST nodes in both SRC and DST sets have the same ids.
// As a result, given the node feature tensor ``X`` of type ``utype``,
// the following code finds the corresponding DST node features of type ``vtype``:
  // the DST node features from the SRC node features. To avoid an expensive
  // sparse lookup, the function ensures that the DST nodes in both SRC and DST
// sets have the same ids. As a result, given the node feature tensor ``X`` of
// type ``utype``, the following code finds the corresponding DST node
// features of type ``vtype``:
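  //
  //   X[:block.num_dst_nodes('vtype')]
  //
  // (an illustrative Python-side slice: because the DST ids come first, the
  // DST features are just the leading num_dst_nodes('vtype') rows of ``X``)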
const int64_t num_etypes = graph->NumEdgeTypes();
const int64_t num_ntypes = graph->NumVertexTypes();
CHECK(rhs_nodes.size() == static_cast<size_t>(num_ntypes))
<< "rhs_nodes not given for every node type";
<< "rhs_nodes not given for every node type";
std::vector<EdgeArray> edge_arrays(num_etypes);
for (int64_t etype = 0; etype < num_etypes; ++etype) {
......@@ -197,9 +178,9 @@ ToBlockGPU(
}
// count lhs and rhs nodes
std::vector<int64_t> maxNodesPerType(num_ntypes*2, 0);
std::vector<int64_t> maxNodesPerType(num_ntypes * 2, 0);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
maxNodesPerType[ntype+num_ntypes] += rhs_nodes[ntype]->shape[0];
maxNodesPerType[ntype + num_ntypes] += rhs_nodes[ntype]->shape[0];
if (generate_lhs_nodes) {
if (include_rhs_in_lhs) {
......@@ -226,16 +207,16 @@ ToBlockGPU(
if (generate_lhs_nodes) {
std::vector<int64_t> src_node_offsets(num_ntypes, 0);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
src_nodes[ntype] = NewIdArray(maxNodesPerType[ntype], ctx,
sizeof(IdType)*8);
src_nodes[ntype] =
NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8);
if (include_rhs_in_lhs) {
// place rhs nodes first
device->CopyDataFromTo(rhs_nodes[ntype].Ptr<IdType>(), 0,
src_nodes[ntype].Ptr<IdType>(), src_node_offsets[ntype],
sizeof(IdType)*rhs_nodes[ntype]->shape[0],
rhs_nodes[ntype]->ctx, src_nodes[ntype]->ctx,
rhs_nodes[ntype]->dtype);
src_node_offsets[ntype] += sizeof(IdType)*rhs_nodes[ntype]->shape[0];
device->CopyDataFromTo(
rhs_nodes[ntype].Ptr<IdType>(), 0, src_nodes[ntype].Ptr<IdType>(),
src_node_offsets[ntype],
sizeof(IdType) * rhs_nodes[ntype]->shape[0], rhs_nodes[ntype]->ctx,
src_nodes[ntype]->ctx, rhs_nodes[ntype]->dtype);
src_node_offsets[ntype] += sizeof(IdType) * rhs_nodes[ntype]->shape[0];
}
}
for (int64_t etype = 0; etype < num_etypes; ++etype) {
......@@ -244,14 +225,13 @@ ToBlockGPU(
if (edge_arrays[etype].src.defined()) {
device->CopyDataFromTo(
edge_arrays[etype].src.Ptr<IdType>(), 0,
src_nodes[srctype].Ptr<IdType>(),
src_node_offsets[srctype],
sizeof(IdType)*edge_arrays[etype].src->shape[0],
rhs_nodes[srctype]->ctx,
src_nodes[srctype]->ctx,
src_nodes[srctype].Ptr<IdType>(), src_node_offsets[srctype],
sizeof(IdType) * edge_arrays[etype].src->shape[0],
rhs_nodes[srctype]->ctx, src_nodes[srctype]->ctx,
rhs_nodes[srctype]->dtype);
src_node_offsets[srctype] += sizeof(IdType)*edge_arrays[etype].src->shape[0];
src_node_offsets[srctype] +=
sizeof(IdType) * edge_arrays[etype].src->shape[0];
}
}
} else {
......@@ -267,47 +247,35 @@ ToBlockGPU(
if (generate_lhs_nodes) {
lhs_nodes.reserve(num_ntypes);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
lhs_nodes.emplace_back(NewIdArray(
maxNodesPerType[ntype], ctx, sizeof(IdType)*8));
lhs_nodes.emplace_back(
NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
}
}
std::vector<int64_t> num_nodes_per_type(num_ntypes*2);
std::vector<int64_t> num_nodes_per_type(num_ntypes * 2);
// populate RHS nodes from what we already know
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
num_nodes_per_type[num_ntypes+ntype] = rhs_nodes[ntype]->shape[0];
num_nodes_per_type[num_ntypes + ntype] = rhs_nodes[ntype]->shape[0];
}
// populate the mappings
if (generate_lhs_nodes) {
int64_t * count_lhs_device = static_cast<int64_t*>(
device->AllocWorkspace(ctx, sizeof(int64_t)*num_ntypes*2));
int64_t* count_lhs_device = static_cast<int64_t*>(
device->AllocWorkspace(ctx, sizeof(int64_t) * num_ntypes * 2));
maker.Make(
src_nodes,
rhs_nodes,
&node_maps,
count_lhs_device,
&lhs_nodes,
stream);
src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes, stream);
device->CopyDataFromTo(
count_lhs_device, 0,
num_nodes_per_type.data(), 0,
sizeof(*num_nodes_per_type.data())*num_ntypes,
ctx,
DGLContext{kDGLCPU, 0},
DGLDataType{kDGLInt, 64, 1});
count_lhs_device, 0, num_nodes_per_type.data(), 0,
sizeof(*num_nodes_per_type.data()) * num_ntypes, ctx,
DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1});
device->StreamSync(ctx, stream);
// wait for the node counts to finish transferring
device->FreeWorkspace(ctx, count_lhs_device);
} else {
maker.Make(
lhs_nodes,
rhs_nodes,
&node_maps,
stream);
maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
......@@ -321,7 +289,7 @@ ToBlockGPU(
induced_edges.push_back(edge_arrays[etype].id);
} else {
induced_edges.push_back(
aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType)*8, 1}, ctx));
aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx));
}
}
......@@ -329,8 +297,8 @@ ToBlockGPU(
const auto meta_graph = graph->meta_graph();
const EdgeArray etypes = meta_graph->Edges("eid");
const IdArray new_dst = Add(etypes.dst, num_ntypes);
const auto new_meta_graph = ImmutableGraph::CreateFromCOO(
num_ntypes * 2, etypes.src, new_dst);
const auto new_meta_graph =
ImmutableGraph::CreateFromCOO(num_ntypes * 2, etypes.src, new_dst);
// allocate vector for graph relations while GPU is busy
std::vector<HeteroGraphPtr> rel_graphs;
......@@ -358,20 +326,17 @@ ToBlockGPU(
// No rhs nodes are given for this edge type. Create an empty graph.
rel_graphs.push_back(CreateFromCOO(
2, lhs_nodes[srctype]->shape[0], rhs_nodes[dsttype]->shape[0],
aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType)*8, 1}, ctx),
aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType)*8, 1}, ctx)));
aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx),
aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx)));
} else {
rel_graphs.push_back(CreateFromCOO(
2,
lhs_nodes[srctype]->shape[0],
rhs_nodes[dsttype]->shape[0],
new_lhs[etype],
new_rhs[etype]));
2, lhs_nodes[srctype]->shape[0], rhs_nodes[dsttype]->shape[0],
new_lhs[etype], new_rhs[etype]));
}
}
HeteroGraphPtr new_graph = CreateHeteroGraph(
new_meta_graph, rel_graphs, num_nodes_per_type);
HeteroGraphPtr new_graph =
CreateHeteroGraph(new_meta_graph, rel_graphs, num_nodes_per_type);
// return the new graph, the new src nodes, and new edges
return std::make_tuple(new_graph, induced_edges);
......@@ -379,26 +344,22 @@ ToBlockGPU(
} // namespace
// Use explicit names to get around MSVC's broken mangling that thinks the following two
// functions are the same.
// Using template<> fails to export the symbols.
// Use explicit names to get around MSVC's broken mangling that thinks the
// following two functions are the same. Using template<> fails to export the
// symbols.
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
// ToBlock<kDGLCUDA, int32_t>
ToBlockGPU32(
HeteroGraphPtr graph,
const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs,
std::vector<IdArray>* const lhs_nodes) {
HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes) {
return ToBlockGPU<int32_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
// ToBlock<kDGLCUDA, int64_t>
ToBlockGPU64(
HeteroGraphPtr graph,
const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs,
std::vector<IdArray>* const lhs_nodes) {
HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes) {
return ToBlockGPU<int64_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}
......
......@@ -4,17 +4,19 @@
* \brief k-nearest-neighbor (KNN) implementation (cuda)
*/
#include <curand_kernel.h>
#include <dgl/array.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <curand_kernel.h>
#include <algorithm>
#include <limits>
#include <string>
#include <vector>
#include <limits>
#include "../../../array/cuda/dgl_cub.cuh"
#include "../../../runtime/cuda/cuda_common.h"
#include "../../../array/cuda/utils.h"
#include "../../../runtime/cuda/cuda_common.h"
#include "../knn.h"
namespace dgl {
......@@ -26,12 +28,12 @@ namespace impl {
*/
template <typename Type>
struct SharedMemory {
__device__ inline operator Type* () {
__device__ inline operator Type*() {
extern __shared__ int __smem[];
return reinterpret_cast<Type*>(__smem);
}
__device__ inline operator const Type* () const {
__device__ inline operator const Type*() const {
extern __shared__ int __smem[];
return reinterpret_cast<Type*>(__smem);
}
......@@ -41,12 +43,12 @@ struct SharedMemory {
// access compile errors
template <>
struct SharedMemory<double> {
__device__ inline operator double* () {
__device__ inline operator double*() {
extern __shared__ double __smem_d[];
return reinterpret_cast<double*>(__smem_d);
}
__device__ inline operator const double* () const {
__device__ inline operator const double*() const {
extern __shared__ double __smem_d[];
return reinterpret_cast<double*>(__smem_d);
}
......@@ -54,9 +56,8 @@ struct SharedMemory<double> {
/*! \brief Compute Euclidean distance between two vectors in a cuda kernel */
template <typename FloatType, typename IdType>
__device__ FloatType EuclideanDist(const FloatType* vec1,
const FloatType* vec2,
const int64_t dim) {
__device__ FloatType
EuclideanDist(const FloatType* vec1, const FloatType* vec2, const int64_t dim) {
FloatType dist = 0;
IdType idx = 0;
for (; idx < dim - 3; idx += 4) {
......@@ -82,10 +83,9 @@ __device__ FloatType EuclideanDist(const FloatType* vec1,
* than the worst distance.
*/
template <typename FloatType, typename IdType>
__device__ FloatType EuclideanDistWithCheck(const FloatType* vec1,
const FloatType* vec2,
const int64_t dim,
const FloatType worst_dist) {
__device__ FloatType EuclideanDistWithCheck(
const FloatType* vec1, const FloatType* vec2, const int64_t dim,
const FloatType worst_dist) {
FloatType dist = 0;
IdType idx = 0;
bool early_stop = false;
......@@ -151,9 +151,9 @@ __device__ void BuildHeap(IdType* indices, FloatType* dists, int size) {
}
template <typename FloatType, typename IdType>
__device__ void HeapInsert(IdType* indices, FloatType* dist,
IdType new_idx, FloatType new_dist,
int size, bool check_repeat = false) {
__device__ void HeapInsert(
IdType* indices, FloatType* dist, IdType new_idx, FloatType new_dist,
int size, bool check_repeat = false) {
if (new_dist > dist[0]) return;
// check if we have it
......@@ -192,9 +192,9 @@ __device__ void HeapInsert(IdType* indices, FloatType* dist,
}
template <typename FloatType, typename IdType>
__device__ bool FlaggedHeapInsert(IdType* indices, FloatType* dist, bool* flags,
IdType new_idx, FloatType new_dist, bool new_flag,
int size, bool check_repeat = false) {
__device__ bool FlaggedHeapInsert(
IdType* indices, FloatType* dist, bool* flags, IdType new_idx,
FloatType new_dist, bool new_flag, int size, bool check_repeat = false) {
if (new_dist > dist[0]) return false;
// check if we have it
......@@ -239,22 +239,26 @@ __device__ bool FlaggedHeapInsert(IdType* indices, FloatType* dist, bool* flags,
}
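Both insert helpers maintain a size-k max-heap per query, so dist[0] always
holds the worst retained distance and the first check above rejects anything
farther in O(1). A tiny illustration with k = 3 (the concrete values are made
up):

  // dists = {5.0, 2.0, 3.0}: dists[0] is the current worst of the k neighbors.
  // HeapInsert(ids, dists, /*new_idx=*/7, /*new_dist=*/6.0f, 3);  // 6.0 > 5.0, dropped
  // HeapInsert(ids, dists, /*new_idx=*/9, /*new_dist=*/1.0f, 3);  // evicts 5.0; the heap
  //     now holds {1.0, 2.0, 3.0} with 3.0 at dists[0]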
/*!
* \brief Brute force kNN kernel. Compute distance for each pair of input points and get
* the result directly (without a distance matrix).
* \brief Brute force kNN kernel. Compute distance for each pair of input points
* and get the result directly (without a distance matrix).
*/
template <typename FloatType, typename IdType>
__global__ void BruteforceKnnKernel(const FloatType* data_points, const IdType* data_offsets,
const FloatType* query_points, const IdType* query_offsets,
const int k, FloatType* dists, IdType* query_out,
IdType* data_out, const int64_t num_batches,
const int64_t feature_size) {
__global__ void BruteforceKnnKernel(
const FloatType* data_points, const IdType* data_offsets,
const FloatType* query_points, const IdType* query_offsets, const int k,
FloatType* dists, IdType* query_out, IdType* data_out,
const int64_t num_batches, const int64_t feature_size) {
const IdType q_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (q_idx >= query_offsets[num_batches]) return;
IdType batch_idx = 0;
for (IdType b = 0; b < num_batches + 1; ++b) {
if (query_offsets[b] > q_idx) { batch_idx = b - 1; break; }
if (query_offsets[b] > q_idx) {
batch_idx = b - 1;
break;
}
}
const IdType data_start = data_offsets[batch_idx], data_end = data_offsets[batch_idx + 1];
const IdType data_start = data_offsets[batch_idx],
data_end = data_offsets[batch_idx + 1];
for (IdType k_idx = 0; k_idx < k; ++k_idx) {
query_out[q_idx * k + k_idx] = q_idx;
......@@ -264,12 +268,12 @@ __global__ void BruteforceKnnKernel(const FloatType* data_points, const IdType*
for (IdType d_idx = data_start; d_idx < data_end; ++d_idx) {
FloatType tmp_dist = EuclideanDistWithCheck<FloatType, IdType>(
query_points + q_idx * feature_size,
data_points + d_idx * feature_size,
feature_size, worst_dist);
query_points + q_idx * feature_size, data_points + d_idx * feature_size,
feature_size, worst_dist);
IdType out_offset = q_idx * k;
HeapInsert<FloatType, IdType>(data_out + out_offset, dists + out_offset, d_idx, tmp_dist, k);
HeapInsert<FloatType, IdType>(
data_out + out_offset, dists + out_offset, d_idx, tmp_dist, k);
worst_dist = dists[q_idx * k];
}
}
......@@ -281,22 +285,19 @@ __global__ void BruteforceKnnKernel(const FloatType* data_points, const IdType*
* This kernel is faster when the dimension of input points is not large.
*/
template <typename FloatType, typename IdType>
__global__ void BruteforceKnnShareKernel(const FloatType* data_points,
const IdType* data_offsets,
const FloatType* query_points,
const IdType* query_offsets,
const IdType* block_batch_id,
const IdType* local_block_id,
const int k, FloatType* dists,
IdType* query_out, IdType* data_out,
const int64_t num_batches,
const int64_t feature_size) {
__global__ void BruteforceKnnShareKernel(
const FloatType* data_points, const IdType* data_offsets,
const FloatType* query_points, const IdType* query_offsets,
const IdType* block_batch_id, const IdType* local_block_id, const int k,
FloatType* dists, IdType* query_out, IdType* data_out,
const int64_t num_batches, const int64_t feature_size) {
const IdType block_idx = static_cast<IdType>(blockIdx.x);
const IdType block_size = static_cast<IdType>(blockDim.x);
const IdType batch_idx = block_batch_id[block_idx];
const IdType local_bid = local_block_id[block_idx];
const IdType query_start = query_offsets[batch_idx] + block_size * local_bid;
const IdType query_end = min(query_start + block_size, query_offsets[batch_idx + 1]);
const IdType query_end =
min(query_start + block_size, query_offsets[batch_idx + 1]);
if (query_start >= query_end) return;
const IdType query_idx = query_start + threadIdx.x;
const IdType data_start = data_offsets[batch_idx];
......@@ -318,17 +319,20 @@ __global__ void BruteforceKnnShareKernel(const FloatType* data_points,
if (query_idx < query_end) {
for (auto i = 0; i < feature_size; ++i) {
      // to avoid bank conflicts, store the query features in transposed layout here
query_buff[threadIdx.x + i * block_size] = query_points[query_idx * feature_size + i];
query_buff[threadIdx.x + i * block_size] =
query_points[query_idx * feature_size + i];
}
}
// perform computation on each tile
for (auto tile_start = data_start; tile_start < data_end; tile_start += block_size) {
for (auto tile_start = data_start; tile_start < data_end;
tile_start += block_size) {
    // each thread loads one data point into shared memory
IdType load_idx = tile_start + threadIdx.x;
if (load_idx < data_end) {
for (auto i = 0; i < feature_size; ++i) {
data_buff[threadIdx.x * feature_size + i] = data_points[load_idx * feature_size + i];
data_buff[threadIdx.x * feature_size + i] =
data_points[load_idx * feature_size + i];
}
}
__syncthreads();
......@@ -342,16 +346,20 @@ __global__ void BruteforceKnnShareKernel(const FloatType* data_points,
IdType dim_idx = 0;
for (; dim_idx < feature_size - 3; dim_idx += 4) {
FloatType diff0 = query_buff[threadIdx.x + block_size * (dim_idx)]
- data_buff[d_idx * feature_size + dim_idx];
FloatType diff1 = query_buff[threadIdx.x + block_size * (dim_idx + 1)]
- data_buff[d_idx * feature_size + dim_idx + 1];
FloatType diff2 = query_buff[threadIdx.x + block_size * (dim_idx + 2)]
- data_buff[d_idx * feature_size + dim_idx + 2];
FloatType diff3 = query_buff[threadIdx.x + block_size * (dim_idx + 3)]
- data_buff[d_idx * feature_size + dim_idx + 3];
tmp_dist += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
FloatType diff0 = query_buff[threadIdx.x + block_size * (dim_idx)] -
data_buff[d_idx * feature_size + dim_idx];
FloatType diff1 =
query_buff[threadIdx.x + block_size * (dim_idx + 1)] -
data_buff[d_idx * feature_size + dim_idx + 1];
FloatType diff2 =
query_buff[threadIdx.x + block_size * (dim_idx + 2)] -
data_buff[d_idx * feature_size + dim_idx + 2];
FloatType diff3 =
query_buff[threadIdx.x + block_size * (dim_idx + 3)] -
data_buff[d_idx * feature_size + dim_idx + 3];
tmp_dist +=
diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
if (tmp_dist > worst_dist) {
early_stop = true;
......@@ -361,8 +369,9 @@ __global__ void BruteforceKnnShareKernel(const FloatType* data_points,
}
for (; dim_idx < feature_size; ++dim_idx) {
const FloatType diff = query_buff[threadIdx.x + dim_idx * block_size]
- data_buff[d_idx * feature_size + dim_idx];
const FloatType diff =
query_buff[threadIdx.x + dim_idx * block_size] -
data_buff[d_idx * feature_size + dim_idx];
tmp_dist += diff * diff;
if (tmp_dist > worst_dist) {
......@@ -374,8 +383,8 @@ __global__ void BruteforceKnnShareKernel(const FloatType* data_points,
if (early_stop) continue;
HeapInsert<FloatType, IdType>(
res_buff + threadIdx.x * k, dist_buff + threadIdx.x * k,
d_idx + tile_start, tmp_dist, k);
res_buff + threadIdx.x * k, dist_buff + threadIdx.x * k,
d_idx + tile_start, tmp_dist, k);
worst_dist = dist_buff[threadIdx.x * k];
}
}
......@@ -393,9 +402,9 @@ __global__ void BruteforceKnnShareKernel(const FloatType* data_points,
/*! \brief determine the number of blocks for each segment */
template <typename IdType>
__global__ void GetNumBlockPerSegment(const IdType* offsets, IdType* out,
const int64_t batch_size,
const int64_t block_size) {
__global__ void GetNumBlockPerSegment(
const IdType* offsets, IdType* out, const int64_t batch_size,
const int64_t block_size) {
const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < batch_size) {
out[idx] = (offsets[idx + 1] - offsets[idx] - 1) / block_size + 1;
......@@ -404,9 +413,9 @@ __global__ void GetNumBlockPerSegment(const IdType* offsets, IdType* out,
/*! \brief Get the batch index and local index in segment for each block */
template <typename IdType>
__global__ void GetBlockInfo(const IdType* num_block_prefixsum,
IdType* block_batch_id, IdType* local_block_id,
size_t batch_size, size_t num_blocks) {
__global__ void GetBlockInfo(
const IdType* num_block_prefixsum, IdType* block_batch_id,
IdType* local_block_id, size_t batch_size, size_t num_blocks) {
const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType i = 0;
......@@ -421,8 +430,8 @@ __global__ void GetBlockInfo(const IdType* num_block_prefixsum,
}
/*!
* \brief Brute force kNN. Compute distance for each pair of input points and get
* the result directly (without a distance matrix).
* \brief Brute force kNN. Compute distance for each pair of input points and
* get the result directly (without a distance matrix).
*
* \tparam FloatType The type of input points.
* \tparam IdType The type of id.
......@@ -434,9 +443,10 @@ __global__ void GetBlockInfo(const IdType* num_block_prefixsum,
* \param result output array
*/
template <typename FloatType, typename IdType>
void BruteForceKNNCuda(const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result) {
void BruteForceKNNCuda(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const auto& ctx = data_points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
......@@ -450,13 +460,14 @@ void BruteForceKNNCuda(const NDArray& data_points, const IdArray& data_offsets,
IdType* data_out = query_out + k * query_points->shape[0];
FloatType* dists = static_cast<FloatType*>(device->AllocWorkspace(
ctx, k * query_points->shape[0] * sizeof(FloatType)));
ctx, k * query_points->shape[0] * sizeof(FloatType)));
const int64_t block_size = cuda::FindNumThreads(query_points->shape[0]);
const int64_t num_blocks = (query_points->shape[0] - 1) / block_size + 1;
CUDA_KERNEL_CALL(BruteforceKnnKernel, num_blocks, block_size, 0, stream,
data_points_data, data_offsets_data, query_points_data, query_offsets_data,
k, dists, query_out, data_out, batch_size, feature_size);
CUDA_KERNEL_CALL(
BruteforceKnnKernel, num_blocks, block_size, 0, stream, data_points_data,
data_offsets_data, query_points_data, query_offsets_data, k, dists,
query_out, data_out, batch_size, feature_size);
device->FreeWorkspace(ctx, dists);
}
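As the query_out/data_out pointer setup above suggests, the result array packs
the two output rows back to back: the first k * num_queries entries repeat the
query id and the next k * num_queries hold the matched data-point ids. A sketch
of reading it back as (query, neighbor) pairs, assuming the ids have been
copied to host memory into `res` (both names are illustrative):

  const int64_t num_pairs = k * num_queries;   // num_queries: #query points
  const IdType* res_query = res;               // row 0: query ids
  const IdType* res_data = res + num_pairs;    // row 1: neighbor data-point ids
  for (int64_t i = 0; i < num_pairs; ++i) {
    // (res_query[i], res_data[i]) is one kNN edge from query to data point.
  }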
......@@ -477,9 +488,10 @@ void BruteForceKNNCuda(const NDArray& data_points, const IdArray& data_offsets,
* \param result output array
*/
template <typename FloatType, typename IdType>
void BruteForceKNNSharedCuda(const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result) {
void BruteForceKNNSharedCuda(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const auto& ctx = data_points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
......@@ -496,44 +508,44 @@ void BruteForceKNNSharedCuda(const NDArray& data_points, const IdArray& data_off
// determine block size according to this value
int max_sharedmem_per_block = 0;
CUDA_CALL(cudaDeviceGetAttribute(
&max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id));
const int64_t single_shared_mem = (k + 2 * feature_size) * sizeof(FloatType) +
k * sizeof(IdType);
const int64_t block_size = cuda::FindNumThreads(max_sharedmem_per_block / single_shared_mem);
&max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock,
ctx.device_id));
const int64_t single_shared_mem =
(k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType);
const int64_t block_size =
cuda::FindNumThreads(max_sharedmem_per_block / single_shared_mem);
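  // Worked example (illustrative): assuming 48 KB of shared memory per block
  // and that FindNumThreads returns the largest power of two not exceeding its
  // argument, FloatType = float, IdType = int32_t, k = 8 and feature_size = 32
  // give (8 + 2 * 32) * 4 + 8 * 4 = 320 bytes per thread, so at most
  // 49152 / 320 = 153 threads fit and the chosen block size is 128.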
// Determine the number of blocks. We first get the number of blocks for each
// segment. Then we get the block id offset via prefix sum.
IdType* num_block_per_segment = static_cast<IdType*>(
device->AllocWorkspace(ctx, batch_size * sizeof(IdType)));
device->AllocWorkspace(ctx, batch_size * sizeof(IdType)));
IdType* num_block_prefixsum = static_cast<IdType*>(
device->AllocWorkspace(ctx, batch_size * sizeof(IdType)));
device->AllocWorkspace(ctx, batch_size * sizeof(IdType)));
// block size for GetNumBlockPerSegment computation
int64_t temp_block_size = cuda::FindNumThreads(batch_size);
int64_t temp_num_blocks = (batch_size - 1) / temp_block_size + 1;
CUDA_KERNEL_CALL(GetNumBlockPerSegment, temp_num_blocks,
temp_block_size, 0, stream,
query_offsets_data, num_block_per_segment,
batch_size, block_size);
CUDA_KERNEL_CALL(
GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream,
query_offsets_data, num_block_per_segment, batch_size, block_size);
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, num_block_per_segment,
num_block_prefixsum, batch_size, stream));
nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
batch_size, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, num_block_per_segment,
num_block_prefixsum, batch_size, stream));
prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum,
batch_size, stream));
device->FreeWorkspace(ctx, prefix_temp);
int64_t num_blocks = 0, final_elem = 0, copyoffset = (batch_size - 1) * sizeof(IdType);
int64_t num_blocks = 0, final_elem = 0,
copyoffset = (batch_size - 1) * sizeof(IdType);
device->CopyDataFromTo(
num_block_prefixsum, copyoffset, &num_blocks, 0,
sizeof(IdType), ctx, DGLContext{kDGLCPU, 0},
query_offsets->dtype);
num_block_prefixsum, copyoffset, &num_blocks, 0, sizeof(IdType), ctx,
DGLContext{kDGLCPU, 0}, query_offsets->dtype);
device->CopyDataFromTo(
num_block_per_segment, copyoffset, &final_elem, 0,
sizeof(IdType), ctx, DGLContext{kDGLCPU, 0},
query_offsets->dtype);
num_block_per_segment, copyoffset, &final_elem, 0, sizeof(IdType), ctx,
DGLContext{kDGLCPU, 0}, query_offsets->dtype);
num_blocks += final_elem;
device->FreeWorkspace(ctx, num_block_per_segment);
device->FreeWorkspace(ctx, num_block_prefixsum);
......@@ -541,22 +553,22 @@ void BruteForceKNNSharedCuda(const NDArray& data_points, const IdArray& data_off
// get batch id and local id in segment
temp_block_size = cuda::FindNumThreads(num_blocks);
temp_num_blocks = (num_blocks - 1) / temp_block_size + 1;
IdType* block_batch_id = static_cast<IdType*>(device->AllocWorkspace(
ctx, num_blocks * sizeof(IdType)));
IdType* local_block_id = static_cast<IdType*>(device->AllocWorkspace(
ctx, num_blocks * sizeof(IdType)));
IdType* block_batch_id = static_cast<IdType*>(
device->AllocWorkspace(ctx, num_blocks * sizeof(IdType)));
IdType* local_block_id = static_cast<IdType*>(
device->AllocWorkspace(ctx, num_blocks * sizeof(IdType)));
CUDA_KERNEL_CALL(
GetBlockInfo, temp_num_blocks, temp_block_size, 0,
stream, num_block_prefixsum, block_batch_id,
local_block_id, batch_size, num_blocks);
GetBlockInfo, temp_num_blocks, temp_block_size, 0, stream,
num_block_prefixsum, block_batch_id, local_block_id, batch_size,
num_blocks);
FloatType* dists = static_cast<FloatType*>(device->AllocWorkspace(
ctx, k * query_points->shape[0] * sizeof(FloatType)));
CUDA_KERNEL_CALL(BruteforceKnnShareKernel, num_blocks, block_size,
single_shared_mem * block_size, stream, data_points_data,
data_offsets_data, query_points_data, query_offsets_data,
block_batch_id, local_block_id, k, dists, query_out,
data_out, batch_size, feature_size);
ctx, k * query_points->shape[0] * sizeof(FloatType)));
CUDA_KERNEL_CALL(
BruteforceKnnShareKernel, num_blocks, block_size,
single_shared_mem * block_size, stream, data_points_data,
data_offsets_data, query_points_data, query_offsets_data, block_batch_id,
local_block_id, k, dists, query_out, data_out, batch_size, feature_size);
device->FreeWorkspace(ctx, dists);
device->FreeWorkspace(ctx, local_block_id);
......@@ -564,9 +576,8 @@ void BruteForceKNNSharedCuda(const NDArray& data_points, const IdArray& data_off
}
/*! \brief Setup rng state for nn-descent */
__global__ void SetupRngKernel(curandState* states,
const uint64_t seed,
const size_t n) {
__global__ void SetupRngKernel(
curandState* states, const uint64_t seed, const size_t n) {
size_t id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < n) {
curand_init(seed, id, 0, states + id);
......@@ -578,16 +589,10 @@ __global__ void SetupRngKernel(curandState* states,
 * for each node
*/
template <typename FloatType, typename IdType>
__global__ void RandomInitNeighborsKernel(const FloatType* points,
const IdType* offsets,
IdType* central_nodes,
IdType* neighbors,
FloatType* dists,
bool* flags,
const int k,
const int64_t feature_size,
const int64_t batch_size,
const uint64_t seed) {
__global__ void RandomInitNeighborsKernel(
const FloatType* points, const IdType* offsets, IdType* central_nodes,
IdType* neighbors, FloatType* dists, bool* flags, const int k,
const int64_t feature_size, const int64_t batch_size, const uint64_t seed) {
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType batch_idx = 0;
if (point_idx >= offsets[batch_size]) return;
......@@ -623,21 +628,23 @@ __global__ void RandomInitNeighborsKernel(const FloatType* points,
for (IdType i = 0; i < k; ++i) {
current_flags[i] = true;
current_dists[i] = EuclideanDist<FloatType, IdType>(
points + point_idx * feature_size,
points + current_neighbors[i] * feature_size,
feature_size);
points + point_idx * feature_size,
points + current_neighbors[i] * feature_size, feature_size);
}
// build heap
BuildHeap<FloatType, IdType>(neighbors + point_idx * k, current_dists, k);
}
/*! \brief Randomly select candidates from current knn and reverse-knn graph for nn-descent */
/*!
* \brief Randomly select candidates from current knn and reverse-knn graph for
* nn-descent.
*/
template <typename IdType>
__global__ void FindCandidatesKernel(const IdType* offsets, IdType* new_candidates,
IdType* old_candidates, IdType* neighbors, bool* flags,
const uint64_t seed, const int64_t batch_size,
const int num_candidates, const int k) {
__global__ void FindCandidatesKernel(
const IdType* offsets, IdType* new_candidates, IdType* old_candidates,
IdType* neighbors, bool* flags, const uint64_t seed,
const int64_t batch_size, const int num_candidates, const int k) {
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
IdType batch_idx = 0;
if (point_idx >= offsets[batch_size]) return;
......@@ -652,13 +659,16 @@ __global__ void FindCandidatesKernel(const IdType* offsets, IdType* new_candidat
}
}
IdType segment_start = offsets[batch_idx], segment_end = offsets[batch_idx + 1];
IdType segment_start = offsets[batch_idx],
segment_end = offsets[batch_idx + 1];
IdType* current_neighbors = neighbors + point_idx * k;
bool* current_flags = flags + point_idx * k;
// reset candidates
IdType* new_candidates_ptr = new_candidates + point_idx * (num_candidates + 1);
IdType* old_candidates_ptr = old_candidates + point_idx * (num_candidates + 1);
IdType* new_candidates_ptr =
new_candidates + point_idx * (num_candidates + 1);
IdType* old_candidates_ptr =
old_candidates + point_idx * (num_candidates + 1);
new_candidates_ptr[0] = 0;
old_candidates_ptr[0] = 0;
......@@ -666,7 +676,8 @@ __global__ void FindCandidatesKernel(const IdType* offsets, IdType* new_candidat
// here we use candidate[0] for reservoir sampling temporarily
for (IdType i = 0; i < k; ++i) {
IdType candidate = current_neighbors[i];
IdType* candidate_array = current_flags[i] ? new_candidates_ptr : old_candidates_ptr;
IdType* candidate_array =
current_flags[i] ? new_candidates_ptr : old_candidates_ptr;
IdType curr_num = candidate_array[0];
IdType* candidate_data = candidate_array + 1;
......@@ -686,7 +697,8 @@ __global__ void FindCandidatesKernel(const IdType* offsets, IdType* new_candidat
for (IdType i = index_start; i < index_end; ++i) {
if (neighbors[i] == point_idx) {
IdType reverse_candidate = (i - index_start) / k + segment_start;
IdType* candidate_array = flags[i] ? new_candidates_ptr : old_candidates_ptr;
IdType* candidate_array =
flags[i] ? new_candidates_ptr : old_candidates_ptr;
IdType curr_num = candidate_array[0];
IdType* candidate_data = candidate_array + 1;
......@@ -702,8 +714,10 @@ __global__ void FindCandidatesKernel(const IdType* offsets, IdType* new_candidat
}
// set candidate[0] back to length
if (new_candidates_ptr[0] > num_candidates) new_candidates_ptr[0] = num_candidates;
if (old_candidates_ptr[0] > num_candidates) old_candidates_ptr[0] = num_candidates;
if (new_candidates_ptr[0] > num_candidates)
new_candidates_ptr[0] = num_candidates;
if (old_candidates_ptr[0] > num_candidates)
old_candidates_ptr[0] = num_candidates;
// mark new_candidates as old
IdType num_new_candidates = new_candidates_ptr[0];
......@@ -723,19 +737,20 @@ __global__ void FindCandidatesKernel(const IdType* offsets, IdType* new_candidat
/*! \brief Update knn graph according to selected candidates for nn-descent */
template <typename FloatType, typename IdType>
__global__ void UpdateNeighborsKernel(const FloatType* points, const IdType* offsets,
IdType* neighbors, IdType* new_candidates,
IdType* old_candidates, FloatType* distances,
bool* flags, IdType* num_updates,
const int64_t batch_size, const int num_candidates,
const int k, const int64_t feature_size) {
__global__ void UpdateNeighborsKernel(
const FloatType* points, const IdType* offsets, IdType* neighbors,
IdType* new_candidates, IdType* old_candidates, FloatType* distances,
bool* flags, IdType* num_updates, const int64_t batch_size,
const int num_candidates, const int k, const int64_t feature_size) {
const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (point_idx >= offsets[batch_size]) return;
IdType* current_neighbors = neighbors + point_idx * k;
bool* current_flags = flags + point_idx * k;
FloatType* current_dists = distances + point_idx * k;
IdType* new_candidates_ptr = new_candidates + point_idx * (num_candidates + 1);
IdType* old_candidates_ptr = old_candidates + point_idx * (num_candidates + 1);
IdType* new_candidates_ptr =
new_candidates + point_idx * (num_candidates + 1);
IdType* old_candidates_ptr =
old_candidates + point_idx * (num_candidates + 1);
IdType num_new_candidates = new_candidates_ptr[0];
IdType num_old_candidates = old_candidates_ptr[0];
IdType current_num_updates = 0;
......@@ -755,15 +770,14 @@ __global__ void UpdateNeighborsKernel(const FloatType* points, const IdType* off
for (IdType j = 1; j <= num_twohop_new; ++j) {
IdType twohop_new_c = twohop_new_ptr[j];
FloatType new_dist = EuclideanDistWithCheck<FloatType, IdType>(
points + point_idx * feature_size,
points + twohop_new_c * feature_size,
feature_size, worst_dist);
points + point_idx * feature_size,
points + twohop_new_c * feature_size, feature_size, worst_dist);
if (FlaggedHeapInsert<FloatType, IdType>(
current_neighbors, current_dists, current_flags,
twohop_new_c, new_dist, true, k, true)) {
++current_num_updates;
worst_dist = current_dists[0];
current_neighbors, current_dists, current_flags, twohop_new_c,
new_dist, true, k, true)) {
++current_num_updates;
worst_dist = current_dists[0];
}
}
......@@ -771,15 +785,14 @@ __global__ void UpdateNeighborsKernel(const FloatType* points, const IdType* off
for (IdType j = 1; j <= num_twohop_old; ++j) {
IdType twohop_old_c = twohop_old_ptr[j];
FloatType new_dist = EuclideanDistWithCheck<FloatType, IdType>(
points + point_idx * feature_size,
points + twohop_old_c * feature_size,
feature_size, worst_dist);
points + point_idx * feature_size,
points + twohop_old_c * feature_size, feature_size, worst_dist);
if (FlaggedHeapInsert<FloatType, IdType>(
current_neighbors, current_dists, current_flags,
twohop_old_c, new_dist, true, k, true)) {
++current_num_updates;
worst_dist = current_dists[0];
current_neighbors, current_dists, current_flags, twohop_old_c,
new_dist, true, k, true)) {
++current_num_updates;
worst_dist = current_dists[0];
}
}
}
......@@ -797,15 +810,14 @@ __global__ void UpdateNeighborsKernel(const FloatType* points, const IdType* off
for (IdType j = 1; j <= num_twohop_new; ++j) {
IdType twohop_new_c = twohop_new_ptr[j];
FloatType new_dist = EuclideanDistWithCheck<FloatType, IdType>(
points + point_idx * feature_size,
points + twohop_new_c * feature_size,
feature_size, worst_dist);
points + point_idx * feature_size,
points + twohop_new_c * feature_size, feature_size, worst_dist);
if (FlaggedHeapInsert<FloatType, IdType>(
current_neighbors, current_dists, current_flags,
twohop_new_c, new_dist, true, k, true)) {
++current_num_updates;
worst_dist = current_dists[0];
current_neighbors, current_dists, current_flags, twohop_new_c,
new_dist, true, k, true)) {
++current_num_updates;
worst_dist = current_dists[0];
}
}
}
......@@ -816,24 +828,25 @@ __global__ void UpdateNeighborsKernel(const FloatType* points, const IdType* off
} // namespace impl
template <DGLDeviceType XPU, typename FloatType, typename IdType>
void KNN(const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm) {
void KNN(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm) {
if (algorithm == std::string("bruteforce")) {
impl::BruteForceKNNCuda<FloatType, IdType>(
data_points, data_offsets, query_points, query_offsets, k, result);
data_points, data_offsets, query_points, query_offsets, k, result);
} else if (algorithm == std::string("bruteforce-sharemem")) {
impl::BruteForceKNNSharedCuda<FloatType, IdType>(
data_points, data_offsets, query_points, query_offsets, k, result);
data_points, data_offsets, query_points, query_offsets, k, result);
} else {
LOG(FATAL) << "Algorithm " << algorithm << " is not supported on CUDA.";
}
}
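A host-side usage sketch (the array variables are assumed to be prepared
elsewhere; the argument order matches the CAPI registration at the end of this
diff):

  // "bruteforce" selects the plain kernel; the shared-memory variant is the
  // better choice when feature_size is small, per the kernel comments above.
  KNN<kDGLCUDA, float, int64_t>(
      data_points, data_offsets, query_points, query_offsets, /*k=*/8, result,
      "bruteforce-sharemem");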
template <DGLDeviceType XPU, typename FloatType, typename IdType>
void NNDescent(const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta) {
void NNDescent(
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const auto& ctx = points->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
......@@ -847,66 +860,68 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
IdType* neighbors = central_nodes + k * num_nodes;
uint64_t seed;
int warp_size = 0;
CUDA_CALL(cudaDeviceGetAttribute(
&warp_size, cudaDevAttrWarpSize, ctx.device_id));
// We don't need large block sizes, since there's not much inter-thread communication
CUDA_CALL(
cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, ctx.device_id));
// We don't need large block sizes, since there's not much inter-thread
// communication
int64_t block_size = warp_size;
int64_t num_blocks = (num_nodes - 1) / block_size + 1;
// allocate space for candidates, distances and flags
// we use the first element in candidate array to represent length
IdType* new_candidates = static_cast<IdType*>(
device->AllocWorkspace(ctx, num_nodes * (num_candidates + 1) * sizeof(IdType)));
IdType* old_candidates = static_cast<IdType*>(
device->AllocWorkspace(ctx, num_nodes * (num_candidates + 1) * sizeof(IdType)));
IdType* new_candidates = static_cast<IdType*>(device->AllocWorkspace(
ctx, num_nodes * (num_candidates + 1) * sizeof(IdType)));
IdType* old_candidates = static_cast<IdType*>(device->AllocWorkspace(
ctx, num_nodes * (num_candidates + 1) * sizeof(IdType)));
IdType* num_updates = static_cast<IdType*>(
device->AllocWorkspace(ctx, num_nodes * sizeof(IdType)));
device->AllocWorkspace(ctx, num_nodes * sizeof(IdType)));
FloatType* distances = static_cast<FloatType*>(
device->AllocWorkspace(ctx, num_nodes * k * sizeof(IdType)));
device->AllocWorkspace(ctx, num_nodes * k * sizeof(IdType)));
bool* flags = static_cast<bool*>(
device->AllocWorkspace(ctx, num_nodes * k * sizeof(IdType)));
device->AllocWorkspace(ctx, num_nodes * k * sizeof(IdType)));
size_t sum_temp_size = 0;
IdType total_num_updates = 0;
IdType* total_num_updates_d = static_cast<IdType*>(
device->AllocWorkspace(ctx, sizeof(IdType)));
IdType* total_num_updates_d =
static_cast<IdType*>(device->AllocWorkspace(ctx, sizeof(IdType)));
CUDA_CALL(cub::DeviceReduce::Sum(
nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes, stream));
IdType* sum_temp_storage = static_cast<IdType*>(
device->AllocWorkspace(ctx, sum_temp_size));
nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes,
stream));
IdType* sum_temp_storage =
static_cast<IdType*>(device->AllocWorkspace(ctx, sum_temp_size));
// random initialize neighbors
seed = RandomEngine::ThreadLocal()->RandInt<uint64_t>(
std::numeric_limits<uint64_t>::max());
std::numeric_limits<uint64_t>::max());
CUDA_KERNEL_CALL(
impl::RandomInitNeighborsKernel, num_blocks, block_size, 0, stream,
points_data, offsets_data, central_nodes, neighbors, distances, flags, k,
feature_size, batch_size, seed);
impl::RandomInitNeighborsKernel, num_blocks, block_size, 0, stream,
points_data, offsets_data, central_nodes, neighbors, distances, flags, k,
feature_size, batch_size, seed);
for (int i = 0; i < num_iters; ++i) {
// select candidates
seed = RandomEngine::ThreadLocal()->RandInt<uint64_t>(
std::numeric_limits<uint64_t>::max());
std::numeric_limits<uint64_t>::max());
CUDA_KERNEL_CALL(
impl::FindCandidatesKernel, num_blocks, block_size, 0,
stream, offsets_data, new_candidates, old_candidates, neighbors,
flags, seed, batch_size, num_candidates, k);
impl::FindCandidatesKernel, num_blocks, block_size, 0, stream,
offsets_data, new_candidates, old_candidates, neighbors, flags, seed,
batch_size, num_candidates, k);
// update
CUDA_KERNEL_CALL(
impl::UpdateNeighborsKernel, num_blocks, block_size, 0, stream,
points_data, offsets_data, neighbors, new_candidates, old_candidates, distances,
flags, num_updates, batch_size, num_candidates, k, feature_size);
impl::UpdateNeighborsKernel, num_blocks, block_size, 0, stream,
points_data, offsets_data, neighbors, new_candidates, old_candidates,
distances, flags, num_updates, batch_size, num_candidates, k,
feature_size);
total_num_updates = 0;
CUDA_CALL(cub::DeviceReduce::Sum(
sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d, num_nodes,
stream));
sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d,
num_nodes, stream));
device->CopyDataFromTo(
total_num_updates_d, 0, &total_num_updates, 0,
sizeof(IdType), ctx, DGLContext{kDGLCPU, 0},
offsets->dtype);
total_num_updates_d, 0, &total_num_updates, 0, sizeof(IdType), ctx,
DGLContext{kDGLCPU, 0}, offsets->dtype);
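    // Illustrative convergence numbers for the check below: with delta = 0.001,
    // k = 10 and num_nodes = 1,000,000, the iteration loop exits once a pass
    // performs no more than 0.001 * 10 * 1,000,000 = 10,000 neighbor updates.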
if (total_num_updates <= static_cast<IdType>(delta * k * num_nodes)) {
break;
......@@ -923,38 +938,34 @@ void NNDescent(const NDArray& points, const IdArray& offsets,
}
template void KNN<kDGLCUDA, float, int32_t>(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm);
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm);
template void KNN<kDGLCUDA, float, int64_t>(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm);
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm);
template void KNN<kDGLCUDA, double, int32_t>(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm);
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm);
template void KNN<kDGLCUDA, double, int64_t>(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm);
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm);
template void NNDescent<kDGLCUDA, float, int32_t>(
const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta);
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta);
template void NNDescent<kDGLCUDA, float, int64_t>(
const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta);
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta);
template void NNDescent<kDGLCUDA, double, int32_t>(
const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta);
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta);
template void NNDescent<kDGLCUDA, double, int64_t>(
const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta);
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta);
} // namespace transform
} // namespace dgl
......@@ -4,9 +4,11 @@
* \brief k-nearest-neighbor (KNN) interface
*/
#include <dgl/runtime/registry.h>
#include <dgl/runtime/packed_func.h>
#include "knn.h"
#include <dgl/runtime/packed_func.h>
#include <dgl/runtime/registry.h>
#include "../../array/check.h"
using namespace dgl::runtime;
......@@ -14,57 +16,59 @@ namespace dgl {
namespace transform {
DGL_REGISTER_GLOBAL("transform._CAPI_DGLKNN")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
const NDArray data_points = args[0];
const IdArray data_offsets = args[1];
const NDArray query_points = args[2];
const IdArray query_offsets = args[3];
const int k = args[4];
IdArray result = args[5];
const std::string algorithm = args[6];
.set_body([](DGLArgs args, DGLRetValue* rv) {
const NDArray data_points = args[0];
const IdArray data_offsets = args[1];
const NDArray query_points = args[2];
const IdArray query_offsets = args[3];
const int k = args[4];
IdArray result = args[5];
const std::string algorithm = args[6];
aten::CheckContiguous(
{data_points, data_offsets, query_points, query_offsets, result},
{"data_points", "data_offsets", "query_points", "query_offsets", "result"});
aten::CheckCtx(
data_points->ctx, {data_offsets, query_points, query_offsets, result},
{"data_offsets", "query_points", "query_offsets", "result"});
aten::CheckContiguous(
{data_points, data_offsets, query_points, query_offsets, result},
{"data_points", "data_offsets", "query_points", "query_offsets",
"result"});
aten::CheckCtx(
data_points->ctx, {data_offsets, query_points, query_offsets, result},
{"data_offsets", "query_points", "query_offsets", "result"});
ATEN_XPU_SWITCH_CUDA(data_points->ctx.device_type, XPU, "KNN", {
ATEN_FLOAT_TYPE_SWITCH(data_points->dtype, FloatType, "data_points", {
ATEN_ID_TYPE_SWITCH(result->dtype, IdType, {
KNN<XPU, FloatType, IdType>(
data_points, data_offsets, query_points,
query_offsets, k, result, algorithm);
ATEN_XPU_SWITCH_CUDA(data_points->ctx.device_type, XPU, "KNN", {
ATEN_FLOAT_TYPE_SWITCH(data_points->dtype, FloatType, "data_points", {
ATEN_ID_TYPE_SWITCH(result->dtype, IdType, {
KNN<XPU, FloatType, IdType>(
data_points, data_offsets, query_points, query_offsets, k,
result, algorithm);
});
});
});
});
});
DGL_REGISTER_GLOBAL("transform._CAPI_DGLNNDescent")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
const NDArray points = args[0];
const IdArray offsets = args[1];
const IdArray result = args[2];
const int k = args[3];
const int num_iters = args[4];
const int num_candidates = args[5];
const double delta = args[6];
.set_body([](DGLArgs args, DGLRetValue* rv) {
const NDArray points = args[0];
const IdArray offsets = args[1];
const IdArray result = args[2];
const int k = args[3];
const int num_iters = args[4];
const int num_candidates = args[5];
const double delta = args[6];
aten::CheckContiguous(
{points, offsets, result}, {"points", "offsets", "result"});
aten::CheckCtx(
points->ctx, {points, offsets, result}, {"points", "offsets", "result"});
aten::CheckContiguous(
{points, offsets, result}, {"points", "offsets", "result"});
aten::CheckCtx(
points->ctx, {points, offsets, result},
{"points", "offsets", "result"});
ATEN_XPU_SWITCH_CUDA(points->ctx.device_type, XPU, "NNDescent", {
ATEN_FLOAT_TYPE_SWITCH(points->dtype, FloatType, "points", {
ATEN_ID_TYPE_SWITCH(result->dtype, IdType, {
NNDescent<XPU, FloatType, IdType>(
points, offsets, result, k, num_iters, num_candidates, delta);
ATEN_XPU_SWITCH_CUDA(points->ctx.device_type, XPU, "NNDescent", {
ATEN_FLOAT_TYPE_SWITCH(points->dtype, FloatType, "points", {
ATEN_ID_TYPE_SWITCH(result->dtype, IdType, {
NNDescent<XPU, FloatType, IdType>(
points, offsets, result, k, num_iters, num_candidates, delta);
});
});
});
});
});
} // namespace transform
} // namespace dgl
......@@ -8,6 +8,7 @@
#define DGL_GRAPH_TRANSFORM_KNN_H_
#include <dgl/array.h>
#include <string>
namespace dgl {
......@@ -15,42 +16,45 @@ namespace transform {
/*!
* \brief For each point in each segment in \a query_points, find \a k nearest
* points in the same segment in \a data_points. \a data_offsets and \a query_offsets
* determine the start index of each segment in \a data_points and \a query_points.
* points in the same segment in \a data_points. \a data_offsets and \a
* query_offsets determine the start index of each segment in \a
* data_points and \a query_points.
*
* \param data_points dataset points.
* \param data_offsets offsets of point index in \a data_points.
* \param query_points query points.
* \param query_offsets offsets of point index in \a query_points.
* \param k the number of nearest points.
* \param result output array. A 2D tensor indicating the index
* relation between \a query_points and \a data_points.
* \param result output array. A 2D tensor indicating the index relation
* between \a query_points and \a data_points.
* \param algorithm algorithm used to compute the k-nearest neighbors.
*/
template <DGLDeviceType XPU, typename FloatType, typename IdType>
void KNN(const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets,
const int k, IdArray result, const std::string& algorithm);
void KNN(
const NDArray& data_points, const IdArray& data_offsets,
const NDArray& query_points, const IdArray& query_offsets, const int k,
IdArray result, const std::string& algorithm);
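
Editor's note: for readers unfamiliar with the segmented layout described above, here is a hedged standalone sketch (plain C++, not part of this commit) of how the offset arrays carve the flat point buffers into segments; the shapes, values, and the CSR-style "last offset equals total point count" convention are illustrative assumptions.

// [Editorial sketch, not part of this commit] Two point segments with
// feature_size = 2, stored row-major, plus the offset arrays that mark where
// each segment starts (assuming the usual CSR-style convention where the
// final offset is the total number of points).
#include <cstdint>
#include <vector>

int main() {
  std::vector<float> data_points = {
      0.f, 0.f,  1.f, 0.f,  0.f, 1.f,   // segment 0: 3 points
      5.f, 5.f,  6.f, 5.f};             // segment 1: 2 points
  std::vector<int64_t> data_offsets = {0, 3, 5};
  // Queries use the same layout; both offset arrays describe the same number
  // of segments, and each query only searches within its own segment.
  std::vector<float> query_points = {0.5f, 0.5f,  5.5f, 5.f};
  std::vector<int64_t> query_offsets = {0, 1, 2};
  // With k = 1, query 0 is matched against segment 0 only and query 1
  // against segment 1 only.
  return 0;
}
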
/*!
* \brief For each input point, find \a k approximate nearest points in the same
* segment using NN-descent algorithm.
* segment using NN-descent algorithm.
*
* \param points input points.
* \param offsets offsets of point index.
* \param result output array. A 2D tensor indicating the index relation between points.
* \param result output array. A 2D tensor indicating the index relation between
* points.
* \param k the number of nearest points.
* \param num_iters The maximum number of NN-descent iterations to perform.
* \param num_candidates The maximum number of candidates to be considered during one iteration.
* \param num_candidates The maximum number of candidates to be considered
* during one iteration.
 * \param delta A value that controls the early abort.
*/
template <DGLDeviceType XPU, typename FloatType, typename IdType>
void NNDescent(const NDArray& points, const IdArray& offsets,
IdArray result, const int k, const int num_iters,
const int num_candidates, const double delta);
void NNDescent(
const NDArray& points, const IdArray& offsets, IdArray result, const int k,
const int num_iters, const int num_candidates, const double delta);
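
Editor's note: the `delta` early-abort documented above mirrors the loop in the CUDA implementation earlier in this diff (stop once fewer than `delta * k * num_nodes` neighbor entries changed in a round). A hedged standalone sketch of that stopping rule; the update counter is a stand-in, not DGL code.

// [Editorial sketch, not part of this commit] Skeleton of the NN-descent
// iteration/stopping logic: run at most num_iters rounds and abort early
// once the number of neighbor updates drops below delta * k * num_nodes.
#include <cstdint>

int64_t FakeCountUpdates(int iter) {  // stand-in for one NN-descent round
  return 1000 >> iter;                // pretend updates shrink each round
}

int main() {
  const int64_t num_nodes = 100;
  const int k = 8;
  const int num_iters = 20;
  const double delta = 0.001;
  for (int iter = 0; iter < num_iters; ++iter) {
    int64_t total_num_updates = FakeCountUpdates(iter);
    if (total_num_updates <= static_cast<int64_t>(delta * k * num_nodes)) {
      break;  // converged: the k-NN graph barely changed this round
    }
  }
  return 0;
}
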
} // namespace transform
} // namespace dgl
#endif // DGL_GRAPH_TRANSFORM_KNN_H_
......@@ -4,12 +4,14 @@
* \brief Line graph implementation
*/
#include <dgl/base_heterograph.h>
#include <dgl/transform.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/packed_func_ext.h>
#include <vector>
#include <dgl/transform.h>
#include <utility>
#include <vector>
#include "../../c_api_common.h"
#include "../heterograph.h"
......@@ -21,26 +23,25 @@ using namespace dgl::aten;
namespace transform {
/*!
* \brief Create Line Graph
* \param hg Graph
* \param backtracking whether the pair of (v, u) (u, v) edges are treated as linked
* \return The Line Graph
* \brief Create Line Graph.
* \param hg Graph.
* \param backtracking whether the pair of (v, u) (u, v) edges are treated as
* linked.
* \return The Line Graph.
*/
HeteroGraphPtr CreateLineGraph(
HeteroGraphPtr hg,
bool backtracking) {
HeteroGraphPtr CreateLineGraph(HeteroGraphPtr hg, bool backtracking) {
const auto hgp = std::dynamic_pointer_cast<HeteroGraph>(hg);
return hgp->LineGraph(backtracking);
}
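
Editor's note: to make the `backtracking` flag concrete, here is a hedged standalone sketch (plain C++, not part of this commit). It assumes the usual directed line-graph rule, edge i connects to edge j iff dst(i) == src(j), with non-backtracking additionally skipping the pure reversal.

// [Editorial sketch, not part of this commit] Line-graph edges for
// e0 = (0->1), e1 = (1->0), e2 = (1->2).
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> src = {0, 1, 1}, dst = {1, 0, 2};
  const bool backtracking = false;  // flip to true to keep reversed pairs
  for (size_t i = 0; i < src.size(); ++i) {
    for (size_t j = 0; j < src.size(); ++j) {
      if (i == j || dst[i] != src[j]) continue;       // edges must chain
      const bool is_reverse = (src[i] == dst[j] && dst[i] == src[j]);
      if (!backtracking && is_reverse) continue;      // drop (u,v)->(v,u)
      std::printf("line-graph edge: e%zu -> e%zu\n", i, j);
    }
  }
  // backtracking == true : e0->e1, e0->e2, e1->e0
  // backtracking == false: e0->e2
  return 0;
}
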
DGL_REGISTER_GLOBAL("transform._CAPI_DGLHeteroLineGraph")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
bool backtracking = args[1];
.set_body([](DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
bool backtracking = args[1];
auto hgptr = CreateLineGraph(hg.sptr(), backtracking);
*rv = HeteroGraphRef(hgptr);
});
auto hgptr = CreateLineGraph(hg.sptr(), backtracking);
*rv = HeteroGraphRef(hgptr);
});
}; // namespace transform
}; // namespace dgl
......@@ -19,14 +19,15 @@ namespace transform {
#if !defined(_WIN32)
IdArray MetisPartition(UnitGraphPtr g, int k, NDArray vwgt_arr,
const std::string &mode, bool obj_cut) {
IdArray MetisPartition(
UnitGraphPtr g, int k, NDArray vwgt_arr, const std::string &mode,
bool obj_cut) {
// Mode can only be "k-way" or "recursive"
CHECK(mode == "k-way" || mode == "recursive")
<< "mode can only be \"k-way\" or \"recursive\"";
<< "mode can only be \"k-way\" or \"recursive\"";
// The index type of Metis needs to be compatible with DGL index type.
CHECK_EQ(sizeof(idx_t), sizeof(int64_t))
<< "Metis only supports int64 graph for now";
<< "Metis only supports int64 graph for now";
// This is a symmetric graph, so in-csr and out-csr are the same.
const auto mat = g->GetCSCMatrix(0);
// const auto mat = g->GetInCSR()->ToCSRMatrix();
......@@ -42,16 +43,17 @@ IdArray MetisPartition(UnitGraphPtr g, int k, NDArray vwgt_arr,
int64_t vwgt_len = vwgt_arr->shape[0];
CHECK_EQ(sizeof(idx_t), vwgt_arr->dtype.bits / 8)
<< "The vertex weight array doesn't have right type";
<< "The vertex weight array doesn't have right type";
CHECK(vwgt_len % g->NumVertices(0) == 0)
<< "The vertex weight array doesn't have right number of elements";
<< "The vertex weight array doesn't have right number of elements";
idx_t *vwgt = NULL;
if (vwgt_len > 0) {
ncon = vwgt_len / g->NumVertices(0);
vwgt = static_cast<idx_t *>(vwgt_arr->data);
}
auto partition_func = (mode == "k-way") ? METIS_PartGraphKway : METIS_PartGraphRecursive;
auto partition_func =
(mode == "k-way") ? METIS_PartGraphKway : METIS_PartGraphRecursive;
idx_t options[METIS_NOPTIONS];
METIS_SetDefaultOptions(options);
......@@ -67,21 +69,21 @@ IdArray MetisPartition(UnitGraphPtr g, int k, NDArray vwgt_arr,
}
int ret = partition_func(
&nvtxs, // The number of vertices
&ncon, // The number of balancing constraints.
xadj, // indptr
adjncy, // indices
vwgt, // the weights of the vertices
NULL, // The size of the vertices for computing
// the total communication volume
NULL, // The weights of the edges
&nparts, // The number of partitions.
NULL, // the desired weight for each partition and constraint
NULL, // the allowed load imbalance tolerance
options, // the array of options
&objval, // the edge-cut or the total communication volume of
// the partitioning solution
part);
&nvtxs, // The number of vertices
&ncon, // The number of balancing constraints.
xadj, // indptr
adjncy, // indices
vwgt, // the weights of the vertices
NULL, // The size of the vertices for computing
// the total communication volume
NULL, // The weights of the edges
&nparts, // The number of partitions.
NULL, // the desired weight for each partition and constraint
NULL, // the allowed load imbalance tolerance
options, // the array of options
&objval, // the edge-cut or the total communication volume of
// the partitioning solution
part);
if (obj_cut) {
LOG(INFO) << "Partition a graph with " << g->NumVertices(0) << " nodes and "
......@@ -110,22 +112,22 @@ IdArray MetisPartition(UnitGraphPtr g, int k, NDArray vwgt_arr,
#endif // !defined(_WIN32)
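
Editor's note: for readers unfamiliar with the 13-argument METIS call annotated above, a hedged standalone sketch of the same call pattern on a tiny graph (requires linking against METIS; the graph and values are illustrative only).

// [Editorial sketch, not part of this commit] Partition a 4-node cycle into
// two parts with METIS_PartGraphKway, mirroring the argument order above.
#include <metis.h>

#include <cstdio>

int main() {
  idx_t nvtxs = 4, ncon = 1, nparts = 2, objval = 0;
  idx_t xadj[] = {0, 2, 4, 6, 8};             // CSR indptr of cycle 0-1-2-3-0
  idx_t adjncy[] = {1, 3, 0, 2, 1, 3, 0, 2};  // CSR indices (undirected)
  idx_t part[4];
  idx_t options[METIS_NOPTIONS];
  METIS_SetDefaultOptions(options);
  int ret = METIS_PartGraphKway(
      &nvtxs, &ncon, xadj, adjncy, /*vwgt=*/NULL, /*vsize=*/NULL,
      /*adjwgt=*/NULL, &nparts, /*tpwgts=*/NULL, /*ubvec=*/NULL, options,
      &objval, part);
  if (ret == METIS_OK)
    std::printf("edge-cut = %lld\n", static_cast<long long>(objval));
  return 0;
}
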
DGL_REGISTER_GLOBAL("partition._CAPI_DGLMetisPartition_Hetero")
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
auto hgptr = std::dynamic_pointer_cast<HeteroGraph>(g.sptr());
CHECK(hgptr) << "Invalid HeteroGraph object";
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Metis partition only supports HomoGraph";
auto ugptr = hgptr->relation_graphs()[0];
int k = args[1];
NDArray vwgt = args[2];
std::string mode = args[3];
bool obj_cut = args[4];
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
auto hgptr = std::dynamic_pointer_cast<HeteroGraph>(g.sptr());
CHECK(hgptr) << "Invalid HeteroGraph object";
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Metis partition only supports HomoGraph";
auto ugptr = hgptr->relation_graphs()[0];
int k = args[1];
NDArray vwgt = args[2];
std::string mode = args[3];
bool obj_cut = args[4];
#if !defined(_WIN32)
*rv = MetisPartition(ugptr, k, vwgt, mode, obj_cut);
*rv = MetisPartition(ugptr, k, vwgt, mode, obj_cut);
#else
LOG(FATAL) << "Metis partition does not support Windows.";
LOG(FATAL) << "Metis partition does not support Windows.";
#endif // !defined(_WIN32)
});
});
} // namespace transform
} // namespace dgl
......@@ -37,25 +37,28 @@ HeteroGraphPtr ReorderUnitGraph(UnitGraphPtr ug, IdArray new_order) {
if (format & CSC_CODE) {
auto cscmat = ug->GetCSCMatrix(0);
auto new_cscmat = aten::CSRReorder(cscmat, new_order, new_order);
return UnitGraph::CreateFromCSC(ug->NumVertexTypes(), new_cscmat, ug->GetAllowedFormats());
return UnitGraph::CreateFromCSC(
ug->NumVertexTypes(), new_cscmat, ug->GetAllowedFormats());
} else if (format & CSR_CODE) {
auto csrmat = ug->GetCSRMatrix(0);
auto new_csrmat = aten::CSRReorder(csrmat, new_order, new_order);
return UnitGraph::CreateFromCSR(ug->NumVertexTypes(), new_csrmat, ug->GetAllowedFormats());
return UnitGraph::CreateFromCSR(
ug->NumVertexTypes(), new_csrmat, ug->GetAllowedFormats());
} else {
auto coomat = ug->GetCOOMatrix(0);
auto new_coomat = aten::COOReorder(coomat, new_order, new_order);
return UnitGraph::CreateFromCOO(ug->NumVertexTypes(), new_coomat, ug->GetAllowedFormats());
return UnitGraph::CreateFromCOO(
ug->NumVertexTypes(), new_coomat, ug->GetAllowedFormats());
}
}
HaloHeteroSubgraph GetSubgraphWithHalo(std::shared_ptr<HeteroGraph> hg,
IdArray nodes, int num_hops) {
HaloHeteroSubgraph GetSubgraphWithHalo(
std::shared_ptr<HeteroGraph> hg, IdArray nodes, int num_hops) {
CHECK_EQ(hg->NumBits(), 64) << "halo subgraph only supports 64bits graph";
CHECK_EQ(hg->relation_graphs().size(), 1)
<< "halo subgraph only supports homogeneous graph";
<< "halo subgraph only supports homogeneous graph";
CHECK_EQ(nodes->dtype.bits, 64)
<< "halo subgraph only supports 64bits nodes tensor";
<< "halo subgraph only supports 64bits nodes tensor";
const dgl_id_t *nid = static_cast<dgl_id_t *>(nodes->data);
const auto id_len = nodes->shape[0];
// A map contains all nodes in the subgraph.
......@@ -113,8 +116,8 @@ HaloHeteroSubgraph GetSubgraphWithHalo(std::shared_ptr<HeteroGraph> hg,
const dgl_id_t *eid_data = static_cast<dgl_id_t *>(eid->data);
for (int64_t i = 0; i < num_edges; i++) {
auto it1 = orig_nodes.find(src_data[i]);
// If the source node is in the partition, we have got this edge when we iterate over
// the out-edges above.
// If the source node is in the partition, we have got this edge when we
// iterate over the out-edges above.
if (it1 == orig_nodes.end()) {
edge_src.push_back(src_data[i]);
edge_dst.push_back(dst_data[i]);
......@@ -164,10 +167,10 @@ HaloHeteroSubgraph GetSubgraphWithHalo(std::shared_ptr<HeteroGraph> hg,
}
num_edges = edge_src.size();
IdArray new_src = IdArray::Empty({num_edges}, DGLDataType{kDGLInt, 64, 1},
DGLContext{kDGLCPU, 0});
IdArray new_dst = IdArray::Empty({num_edges}, DGLDataType{kDGLInt, 64, 1},
DGLContext{kDGLCPU, 0});
IdArray new_src = IdArray::Empty(
{num_edges}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
IdArray new_dst = IdArray::Empty(
{num_edges}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
dgl_id_t *new_src_data = static_cast<dgl_id_t *>(new_src->data);
dgl_id_t *new_dst_data = static_cast<dgl_id_t *>(new_dst->data);
for (size_t i = 0; i < edge_src.size(); i++) {
......@@ -180,8 +183,8 @@ HaloHeteroSubgraph GetSubgraphWithHalo(std::shared_ptr<HeteroGraph> hg,
dgl_id_t old_nid = old_node_ids[i];
inner_nodes[i] = all_nodes[old_nid];
}
aten::COOMatrix coo(old_node_ids.size(), old_node_ids.size(), new_src,
new_dst);
aten::COOMatrix coo(
old_node_ids.size(), old_node_ids.size(), new_src, new_dst);
HeteroGraphPtr ugptr = UnitGraph::CreateFromCOO(1, coo);
HeteroGraphPtr subg = CreateHeteroGraph(hg->meta_graph(), {ugptr});
HaloHeteroSubgraph halo_subg;
......@@ -194,83 +197,83 @@ HaloHeteroSubgraph GetSubgraphWithHalo(std::shared_ptr<HeteroGraph> hg,
}
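
Editor's note: a hedged standalone sketch (plain STL, not part of this commit) of what a one-hop halo expansion collects, since the function body above is partly elided in this diff: edges touching the partition's inner nodes, plus the boundary ("halo") endpoints they pull in.

// [Editorial sketch, not part of this commit] One-hop halo around inner
// nodes {0, 1}: keep every edge that touches an inner node and record the
// outside endpoints as halo nodes.
#include <cstdint>
#include <set>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<int64_t, int64_t>> edges = {{0, 1}, {1, 2}, {3, 0}};
  std::set<int64_t> inner = {0, 1}, halo;
  for (const auto& e : edges) {
    const bool touches_inner = inner.count(e.first) || inner.count(e.second);
    if (!touches_inner) continue;
    if (!inner.count(e.first)) halo.insert(e.first);
    if (!inner.count(e.second)) halo.insert(e.second);
  }
  // inner == {0, 1}; halo == {2, 3}; all three edges are kept, and the
  // subgraph's inner_nodes flags mark which nodes belong to the partition.
  return 0;
}
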
DGL_REGISTER_GLOBAL("partition._CAPI_DGLReorderGraph_Hetero")
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
auto hgptr = std::dynamic_pointer_cast<HeteroGraph>(g.sptr());
CHECK(hgptr) << "Invalid HeteroGraph object";
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Reorder only supports HomoGraph";
auto ugptr = hgptr->relation_graphs()[0];
const IdArray new_order = args[1];
auto reorder_ugptr = ReorderUnitGraph(ugptr, new_order);
std::vector<HeteroGraphPtr> rel_graphs = {reorder_ugptr};
*rv = HeteroGraphRef(std::make_shared<HeteroGraph>(
hgptr->meta_graph(), rel_graphs, hgptr->NumVerticesPerType()));
});
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
auto hgptr = std::dynamic_pointer_cast<HeteroGraph>(g.sptr());
CHECK(hgptr) << "Invalid HeteroGraph object";
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Reorder only supports HomoGraph";
auto ugptr = hgptr->relation_graphs()[0];
const IdArray new_order = args[1];
auto reorder_ugptr = ReorderUnitGraph(ugptr, new_order);
std::vector<HeteroGraphPtr> rel_graphs = {reorder_ugptr};
*rv = HeteroGraphRef(std::make_shared<HeteroGraph>(
hgptr->meta_graph(), rel_graphs, hgptr->NumVerticesPerType()));
});
DGL_REGISTER_GLOBAL("partition._CAPI_DGLPartitionWithHalo_Hetero")
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
auto hgptr = std::dynamic_pointer_cast<HeteroGraph>(g.sptr());
CHECK(hgptr) << "Invalid HeteroGraph object";
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Metis partition only supports HomoGraph";
auto ugptr = hgptr->relation_graphs()[0];
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
auto hgptr = std::dynamic_pointer_cast<HeteroGraph>(g.sptr());
CHECK(hgptr) << "Invalid HeteroGraph object";
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Metis partition only supports HomoGraph";
auto ugptr = hgptr->relation_graphs()[0];
IdArray node_parts = args[1];
int num_hops = args[2];
IdArray node_parts = args[1];
int num_hops = args[2];
CHECK_EQ(node_parts->dtype.bits, 64)
<< "Only supports 64bits tensor for now";
CHECK_EQ(node_parts->dtype.bits, 64)
<< "Only supports 64bits tensor for now";
const int64_t *part_data = static_cast<int64_t *>(node_parts->data);
int64_t num_nodes = node_parts->shape[0];
std::unordered_map<int, std::vector<int64_t>> part_map;
for (int64_t i = 0; i < num_nodes; i++) {
dgl_id_t part_id = part_data[i];
auto it = part_map.find(part_id);
if (it == part_map.end()) {
std::vector<int64_t> vec;
vec.push_back(i);
part_map[part_id] = vec;
} else {
it->second.push_back(i);
const int64_t *part_data = static_cast<int64_t *>(node_parts->data);
int64_t num_nodes = node_parts->shape[0];
std::unordered_map<int, std::vector<int64_t>> part_map;
for (int64_t i = 0; i < num_nodes; i++) {
dgl_id_t part_id = part_data[i];
auto it = part_map.find(part_id);
if (it == part_map.end()) {
std::vector<int64_t> vec;
vec.push_back(i);
part_map[part_id] = vec;
} else {
it->second.push_back(i);
}
}
}
std::vector<int> part_ids;
std::vector<std::vector<int64_t>> part_nodes;
int max_part_id = 0;
for (auto it = part_map.begin(); it != part_map.end(); it++) {
max_part_id = std::max(it->first, max_part_id);
part_ids.push_back(it->first);
part_nodes.push_back(it->second);
}
// When we construct subgraphs, we need to access both in-edges and out-edges.
// We need to make sure the in-CSR and out-CSR exist. Otherwise, we'll
// try to construct in-CSR and out-CSR in openmp for loop, which will lead
// to some unexpected results.
ugptr->GetInCSR();
ugptr->GetOutCSR();
std::vector<std::shared_ptr<HaloHeteroSubgraph>> subgs(max_part_id + 1);
int num_partitions = part_nodes.size();
runtime::parallel_for(0, num_partitions, [&](int b, int e) {
for (auto i = b; i < e; i++) {
auto nodes = aten::VecToIdArray(part_nodes[i]);
HaloHeteroSubgraph subg = GetSubgraphWithHalo(hgptr, nodes, num_hops);
std::shared_ptr<HaloHeteroSubgraph> subg_ptr(
new HaloHeteroSubgraph(subg));
int part_id = part_ids[i];
subgs[part_id] = subg_ptr;
std::vector<int> part_ids;
std::vector<std::vector<int64_t>> part_nodes;
int max_part_id = 0;
for (auto it = part_map.begin(); it != part_map.end(); it++) {
max_part_id = std::max(it->first, max_part_id);
part_ids.push_back(it->first);
part_nodes.push_back(it->second);
}
// When we construct subgraphs, we need to access both in-edges and
// out-edges. We need to make sure the in-CSR and out-CSR exist.
// Otherwise, we'll try to construct in-CSR and out-CSR in openmp for
// loop, which will lead to some unexpected results.
ugptr->GetInCSR();
ugptr->GetOutCSR();
std::vector<std::shared_ptr<HaloHeteroSubgraph>> subgs(max_part_id + 1);
int num_partitions = part_nodes.size();
runtime::parallel_for(0, num_partitions, [&](int b, int e) {
for (auto i = b; i < e; i++) {
auto nodes = aten::VecToIdArray(part_nodes[i]);
HaloHeteroSubgraph subg = GetSubgraphWithHalo(hgptr, nodes, num_hops);
std::shared_ptr<HaloHeteroSubgraph> subg_ptr(
new HaloHeteroSubgraph(subg));
int part_id = part_ids[i];
subgs[part_id] = subg_ptr;
}
});
List<HeteroSubgraphRef> ret_list;
for (size_t i = 0; i < subgs.size(); i++) {
ret_list.push_back(HeteroSubgraphRef(subgs[i]));
}
*rv = ret_list;
});
List<HeteroSubgraphRef> ret_list;
for (size_t i = 0; i < subgs.size(); i++) {
ret_list.push_back(HeteroSubgraphRef(subgs[i]));
}
*rv = ret_list;
});
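
Editor's note: the comment about calling `GetInCSR()`/`GetOutCSR()` before `parallel_for` reflects a general pattern of forcing lazy construction on the main thread before workers read shared state. A hedged standalone sketch (names and structures are hypothetical, not DGL code).

// [Editorial sketch, not part of this commit] The lazy cache below is
// intentionally NOT thread-safe, which is why it must be built before the
// parallel region, just like the in-CSR/out-CSR above.
#include <thread>
#include <vector>

struct Graph {
  std::vector<int> in_csr;  // empty until built
  const std::vector<int>& GetInCSR() {
    if (in_csr.empty()) in_csr = {0, 1, 3};  // racy if called concurrently
    return in_csr;
  }
};

int main() {
  Graph g;
  g.GetInCSR();  // build once on the main thread
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t)
    workers.emplace_back([&g] { (void)g.GetInCSR(); });  // read-only now
  for (auto& w : workers) w.join();
  return 0;
}
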
template<class IdType>
template <class IdType>
struct EdgeProperty {
IdType eid;
int64_t idx;
......@@ -280,98 +283,101 @@ struct EdgeProperty {
// Reassign edge IDs so that all edges in a partition have contiguous edge IDs.
// The original edge IDs are returned.
DGL_REGISTER_GLOBAL("partition._CAPI_DGLReassignEdges_Hetero")
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
auto hgptr = std::dynamic_pointer_cast<HeteroGraph>(g.sptr());
CHECK(hgptr) << "Invalid HeteroGraph object";
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Reorder only supports HomoGraph";
auto ugptr = hgptr->relation_graphs()[0];
IdArray etype = args[1];
IdArray part_id = args[2];
bool is_incsr = args[3];
auto csrmat = is_incsr ? ugptr->GetCSCMatrix(0) : ugptr->GetCSRMatrix(0);
int64_t num_edges = csrmat.data->shape[0];
int64_t num_rows = csrmat.indptr->shape[0] - 1;
IdArray new_data =
IdArray::Empty({num_edges}, csrmat.data->dtype, csrmat.data->ctx);
// Return the original edge Ids.
*rv = new_data;
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
auto hgptr = std::dynamic_pointer_cast<HeteroGraph>(g.sptr());
CHECK(hgptr) << "Invalid HeteroGraph object";
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Reorder only supports HomoGraph";
auto ugptr = hgptr->relation_graphs()[0];
IdArray etype = args[1];
IdArray part_id = args[2];
bool is_incsr = args[3];
auto csrmat = is_incsr ? ugptr->GetCSCMatrix(0) : ugptr->GetCSRMatrix(0);
int64_t num_edges = csrmat.data->shape[0];
int64_t num_rows = csrmat.indptr->shape[0] - 1;
IdArray new_data =
IdArray::Empty({num_edges}, csrmat.data->dtype, csrmat.data->ctx);
// Return the original edge Ids.
*rv = new_data;
// Generate new edge Ids.
ATEN_ID_TYPE_SWITCH(new_data->dtype, IdType, {
CHECK(etype->dtype.bits == sizeof(IdType) * 8);
CHECK(part_id->dtype.bits == sizeof(IdType) * 8);
const IdType *part_id_data = static_cast<IdType *>(part_id->data);
const IdType *etype_data = static_cast<IdType *>(etype->data);
const IdType *indptr_data = static_cast<IdType *>(csrmat.indptr->data);
IdType *typed_data = static_cast<IdType *>(csrmat.data->data);
IdType *typed_new_data = static_cast<IdType *>(new_data->data);
std::vector<EdgeProperty<IdType>> indexed_eids(num_edges);
for (int64_t i = 0; i < num_rows; i++) {
for (int64_t j = indptr_data[i]; j < indptr_data[i + 1]; j++) {
indexed_eids[j].eid = typed_data[j];
indexed_eids[j].idx = j;
indexed_eids[j].part_id = part_id_data[i];
// Generate new edge Ids.
ATEN_ID_TYPE_SWITCH(new_data->dtype, IdType, {
CHECK(etype->dtype.bits == sizeof(IdType) * 8);
CHECK(part_id->dtype.bits == sizeof(IdType) * 8);
const IdType *part_id_data = static_cast<IdType *>(part_id->data);
const IdType *etype_data = static_cast<IdType *>(etype->data);
const IdType *indptr_data = static_cast<IdType *>(csrmat.indptr->data);
IdType *typed_data = static_cast<IdType *>(csrmat.data->data);
IdType *typed_new_data = static_cast<IdType *>(new_data->data);
std::vector<EdgeProperty<IdType>> indexed_eids(num_edges);
for (int64_t i = 0; i < num_rows; i++) {
for (int64_t j = indptr_data[i]; j < indptr_data[i + 1]; j++) {
indexed_eids[j].eid = typed_data[j];
indexed_eids[j].idx = j;
indexed_eids[j].part_id = part_id_data[i];
}
}
}
auto comp = [etype_data](const EdgeProperty<IdType> &a, const EdgeProperty<IdType> &b) {
if (a.part_id == b.part_id) {
return etype_data[a.eid] < etype_data[b.eid];
} else {
return a.part_id < b.part_id;
auto comp = [etype_data](
const EdgeProperty<IdType> &a,
const EdgeProperty<IdType> &b) {
if (a.part_id == b.part_id) {
return etype_data[a.eid] < etype_data[b.eid];
} else {
return a.part_id < b.part_id;
}
};
// We only need to sort the edges if the input graph has multiple
      // relations. If it's a homogeneous graph, we'll just assign edge Ids
// based on its previous order.
if (etype->shape[0] > 0) {
std::sort(indexed_eids.begin(), indexed_eids.end(), comp);
}
};
// We only need to sort the edges if the input graph has multiple relations.
    // If it's a homogeneous graph, we'll just assign edge Ids based on its previous order.
if (etype->shape[0] > 0) {
std::sort(indexed_eids.begin(), indexed_eids.end(), comp);
}
for (int64_t new_eid = 0; new_eid < num_edges; new_eid++) {
int64_t orig_idx = indexed_eids[new_eid].idx;
typed_new_data[new_eid] = typed_data[orig_idx];
typed_data[orig_idx] = new_eid;
}
for (int64_t new_eid = 0; new_eid < num_edges; new_eid++) {
int64_t orig_idx = indexed_eids[new_eid].idx;
typed_new_data[new_eid] = typed_data[orig_idx];
typed_data[orig_idx] = new_eid;
}
});
ugptr->InvalidateCSR();
ugptr->InvalidateCOO();
});
ugptr->InvalidateCSR();
ugptr->InvalidateCOO();
});
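
Editor's note: a hedged standalone sketch (plain STL, not part of this commit) of the reassignment step above: sort edge slots by (partition, edge type), then hand out contiguous new IDs in that order while remembering the original IDs.

// [Editorial sketch, not part of this commit] Toy edge-ID reassignment.
#include <algorithm>
#include <cstdint>
#include <vector>

int main() {
  struct Slot { int64_t eid, idx, part_id, etype; };
  std::vector<Slot> slots = {
      {0, 0, 1, 0}, {1, 1, 0, 0}, {2, 2, 1, 1}, {3, 3, 0, 1}};
  std::sort(slots.begin(), slots.end(), [](const Slot& a, const Slot& b) {
    return a.part_id != b.part_id ? a.part_id < b.part_id
                                  : a.etype < b.etype;
  });
  std::vector<int64_t> old_eids(slots.size()), new_eids(slots.size());
  for (int64_t new_eid = 0; new_eid < static_cast<int64_t>(slots.size());
       ++new_eid) {
    old_eids[new_eid] = slots[new_eid].eid;  // returned to the caller
    new_eids[slots[new_eid].idx] = new_eid;  // written back per edge slot
  }
  // old_eids == {1, 3, 0, 2}; edges of partition 0 now occupy IDs 0..1.
  return 0;
}
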
DGL_REGISTER_GLOBAL("partition._CAPI_GetHaloSubgraphInnerNodes_Hetero")
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroSubgraphRef g = args[0];
auto gptr = std::dynamic_pointer_cast<HaloHeteroSubgraph>(g.sptr());
CHECK(gptr) << "The input graph has to be HaloHeteroSubgraph";
*rv = gptr->inner_nodes[0];
});
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroSubgraphRef g = args[0];
auto gptr = std::dynamic_pointer_cast<HaloHeteroSubgraph>(g.sptr());
CHECK(gptr) << "The input graph has to be HaloHeteroSubgraph";
*rv = gptr->inner_nodes[0];
});
DGL_REGISTER_GLOBAL("partition._CAPI_DGLMakeSymmetric_Hetero")
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
auto hgptr = std::dynamic_pointer_cast<HeteroGraph>(g.sptr());
CHECK(hgptr) << "Invalid HeteroGraph object";
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Metis partition only supports homogeneous graph";
auto ugptr = hgptr->relation_graphs()[0];
.set_body([](DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef g = args[0];
auto hgptr = std::dynamic_pointer_cast<HeteroGraph>(g.sptr());
CHECK(hgptr) << "Invalid HeteroGraph object";
CHECK_EQ(hgptr->relation_graphs().size(), 1)
<< "Metis partition only supports homogeneous graph";
auto ugptr = hgptr->relation_graphs()[0];
#if !defined(_WIN32)
// TODO(zhengda) should we get whatever CSR exists in the graph.
gk_csr_t *gk_csr = Convert2GKCsr(ugptr->GetCSCMatrix(0), true);
gk_csr_t *sym_gk_csr = gk_csr_MakeSymmetric(gk_csr, GK_CSR_SYM_SUM);
auto mat = Convert2DGLCsr(sym_gk_csr, true);
gk_csr_Free(&gk_csr);
gk_csr_Free(&sym_gk_csr);
// TODO(zhengda) should we get whatever CSR exists in the graph.
gk_csr_t *gk_csr = Convert2GKCsr(ugptr->GetCSCMatrix(0), true);
gk_csr_t *sym_gk_csr = gk_csr_MakeSymmetric(gk_csr, GK_CSR_SYM_SUM);
auto mat = Convert2DGLCsr(sym_gk_csr, true);
gk_csr_Free(&gk_csr);
gk_csr_Free(&sym_gk_csr);
auto new_ugptr = UnitGraph::CreateFromCSC(ugptr->NumVertexTypes(), mat,
ugptr->GetAllowedFormats());
std::vector<HeteroGraphPtr> rel_graphs = {new_ugptr};
*rv = HeteroGraphRef(std::make_shared<HeteroGraph>(
hgptr->meta_graph(), rel_graphs, hgptr->NumVerticesPerType()));
auto new_ugptr = UnitGraph::CreateFromCSC(
ugptr->NumVertexTypes(), mat, ugptr->GetAllowedFormats());
std::vector<HeteroGraphPtr> rel_graphs = {new_ugptr};
*rv = HeteroGraphRef(std::make_shared<HeteroGraph>(
hgptr->meta_graph(), rel_graphs, hgptr->NumVerticesPerType()));
#else
LOG(FATAL) << "The fast version of making symmetric graph is not supported in Windows.";
LOG(FATAL) << "The fast version of making symmetric graph is not "
"supported in Windows.";
#endif // !defined(_WIN32)
});
});
} // namespace transform
} // namespace dgl
......@@ -4,15 +4,16 @@
* \brief Remove edges.
*/
#include <dgl/base_heterograph.h>
#include <dgl/transform.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/container.h>
#include <vector>
#include <utility>
#include <dgl/runtime/registry.h>
#include <dgl/transform.h>
#include <tuple>
#include <utility>
#include <vector>
namespace dgl {
......@@ -21,8 +22,8 @@ using namespace dgl::aten;
namespace transform {
std::pair<HeteroGraphPtr, std::vector<IdArray>>
RemoveEdges(const HeteroGraphPtr graph, const std::vector<IdArray> &eids) {
std::pair<HeteroGraphPtr, std::vector<IdArray>> RemoveEdges(
const HeteroGraphPtr graph, const std::vector<IdArray> &eids) {
std::vector<IdArray> induced_eids;
std::vector<HeteroGraphPtr> rel_graphs;
const int64_t num_etypes = graph->NumEdgeTypes();
......@@ -40,23 +41,30 @@ RemoveEdges(const HeteroGraphPtr graph, const std::vector<IdArray> &eids) {
const COOMatrix &coo = graph->GetCOOMatrix(etype);
const COOMatrix &result = COORemove(coo, eids[etype]);
new_rel_graph = CreateFromCOO(
num_ntypes_rel, result.num_rows, result.num_cols, result.row, result.col);
num_ntypes_rel, result.num_rows, result.num_cols, result.row,
result.col);
induced_eids_rel = result.data;
} else if (fmt == SparseFormat::kCSR) {
const CSRMatrix &csr = graph->GetCSRMatrix(etype);
const CSRMatrix &result = CSRRemove(csr, eids[etype]);
new_rel_graph = CreateFromCSR(
num_ntypes_rel, result.num_rows, result.num_cols, result.indptr, result.indices,
num_ntypes_rel, result.num_rows, result.num_cols, result.indptr,
result.indices,
// TODO(BarclayII): make CSR support null eid array
Range(0, result.indices->shape[0], result.indices->dtype.bits, result.indices->ctx));
Range(
0, result.indices->shape[0], result.indices->dtype.bits,
result.indices->ctx));
induced_eids_rel = result.data;
} else if (fmt == SparseFormat::kCSC) {
const CSRMatrix &csc = graph->GetCSCMatrix(etype);
const CSRMatrix &result = CSRRemove(csc, eids[etype]);
new_rel_graph = CreateFromCSC(
num_ntypes_rel, result.num_rows, result.num_cols, result.indptr, result.indices,
num_ntypes_rel, result.num_rows, result.num_cols, result.indptr,
result.indices,
// TODO(BarclayII): make CSR support null eid array
Range(0, result.indices->shape[0], result.indices->dtype.bits, result.indices->ctx));
Range(
0, result.indices->shape[0], result.indices->dtype.bits,
result.indices->ctx));
induced_eids_rel = result.data;
}
......@@ -70,24 +78,24 @@ RemoveEdges(const HeteroGraphPtr graph, const std::vector<IdArray> &eids) {
}
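
Editor's note: a hedged standalone sketch (plain STL, not part of this commit) of the per-edge-type removal above: drop the listed edge IDs from a COO edge list and keep the surviving original IDs, which play the role of the induced edge IDs handed back to the caller.

// [Editorial sketch, not part of this commit] Remove edges 1 and 3 from a
// four-edge COO list and record the original IDs of the kept edges.
#include <algorithm>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> src = {0, 1, 2, 3}, dst = {1, 2, 3, 0};
  std::vector<int64_t> remove = {1, 3};  // edge IDs to delete
  std::vector<int64_t> new_src, new_dst, induced_eids;
  for (int64_t eid = 0; eid < static_cast<int64_t>(src.size()); ++eid) {
    if (std::find(remove.begin(), remove.end(), eid) != remove.end()) continue;
    new_src.push_back(src[eid]);
    new_dst.push_back(dst[eid]);
    induced_eids.push_back(eid);  // original ID of each kept edge
  }
  // new_src == {0, 2}, new_dst == {1, 3}, induced_eids == {0, 2}.
  return 0;
}
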
DGL_REGISTER_GLOBAL("transform._CAPI_DGLRemoveEdges")
.set_body([] (DGLArgs args, DGLRetValue *rv) {
const HeteroGraphRef graph_ref = args[0];
const std::vector<IdArray> &eids = ListValueToVector<IdArray>(args[1]);
.set_body([](DGLArgs args, DGLRetValue *rv) {
const HeteroGraphRef graph_ref = args[0];
const std::vector<IdArray> &eids = ListValueToVector<IdArray>(args[1]);
HeteroGraphPtr new_graph;
std::vector<IdArray> induced_eids;
std::tie(new_graph, induced_eids) = RemoveEdges(graph_ref.sptr(), eids);
HeteroGraphPtr new_graph;
std::vector<IdArray> induced_eids;
std::tie(new_graph, induced_eids) = RemoveEdges(graph_ref.sptr(), eids);
List<Value> induced_eids_ref;
for (IdArray &array : induced_eids)
induced_eids_ref.push_back(Value(MakeValue(array)));
List<Value> induced_eids_ref;
for (IdArray &array : induced_eids)
induced_eids_ref.push_back(Value(MakeValue(array)));
List<ObjectRef> ret;
ret.push_back(HeteroGraphRef(new_graph));
ret.push_back(induced_eids_ref);
List<ObjectRef> ret;
ret.push_back(HeteroGraphRef(new_graph));
ret.push_back(induced_eids_ref);
*rv = ret;
});
*rv = ret;
});
}; // namespace transform
......
......@@ -19,16 +19,18 @@
#include "to_bipartite.h"
#include <dgl/base_heterograph.h>
#include <dgl/transform.h>
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/base_heterograph.h>
#include <dgl/immutable_graph.h>
#include <dgl/runtime/registry.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h>
#include <vector>
#include <dgl/runtime/registry.h>
#include <dgl/transform.h>
#include <tuple>
#include <utility>
#include <vector>
#include "../../array/cpu/array_utils.h"
namespace dgl {
......@@ -42,11 +44,11 @@ namespace {
// Since partial specialization is not allowed for functions, use this as an
// intermediate for ToBlock where XPU = kDGLCPU.
template<typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
ToBlockCPU(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray>* const lhs_nodes_ptr) {
std::vector<IdArray>& lhs_nodes = *lhs_nodes_ptr;
template <typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlockCPU(
HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray> *const lhs_nodes_ptr) {
std::vector<IdArray> &lhs_nodes = *lhs_nodes_ptr;
const bool generate_lhs_nodes = lhs_nodes.empty();
const int64_t num_etypes = graph->NumEdgeTypes();
......@@ -54,28 +56,29 @@ ToBlockCPU(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
std::vector<EdgeArray> edge_arrays(num_etypes);
CHECK(rhs_nodes.size() == static_cast<size_t>(num_ntypes))
<< "rhs_nodes not given for every node type";
<< "rhs_nodes not given for every node type";
const std::vector<IdHashMap<IdType>> rhs_node_mappings(rhs_nodes.begin(), rhs_nodes.end());
const std::vector<IdHashMap<IdType>> rhs_node_mappings(
rhs_nodes.begin(), rhs_nodes.end());
std::vector<IdHashMap<IdType>> lhs_node_mappings;
if (generate_lhs_nodes) {
// build lhs_node_mappings -- if we don't have them already
// build lhs_node_mappings -- if we don't have them already
if (include_rhs_in_lhs)
lhs_node_mappings = rhs_node_mappings; // copy
else
lhs_node_mappings.resize(num_ntypes);
} else {
lhs_node_mappings = std::vector<IdHashMap<IdType>>(lhs_nodes.begin(), lhs_nodes.end());
lhs_node_mappings =
std::vector<IdHashMap<IdType>>(lhs_nodes.begin(), lhs_nodes.end());
}
for (int64_t etype = 0; etype < num_etypes; ++etype) {
const auto src_dst_types = graph->GetEndpointTypes(etype);
const dgl_type_t srctype = src_dst_types.first;
const dgl_type_t dsttype = src_dst_types.second;
if (!aten::IsNullArray(rhs_nodes[dsttype])) {
const EdgeArray& edges = graph->Edges(etype);
const EdgeArray &edges = graph->Edges(etype);
if (generate_lhs_nodes) {
lhs_node_mappings[srctype].Update(edges.src);
}
......@@ -89,8 +92,8 @@ ToBlockCPU(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
const auto meta_graph = graph->meta_graph();
const EdgeArray etypes = meta_graph->Edges("eid");
const IdArray new_dst = Add(etypes.dst, num_ntypes);
const auto new_meta_graph = ImmutableGraph::CreateFromCOO(
num_ntypes * 2, etypes.src, new_dst);
const auto new_meta_graph =
ImmutableGraph::CreateFromCOO(num_ntypes * 2, etypes.src, new_dst);
for (int64_t ntype = 0; ntype < num_ntypes; ++ntype)
num_nodes_per_type.push_back(lhs_node_mappings[ntype].Size());
......@@ -108,8 +111,8 @@ ToBlockCPU(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
if (rhs_map.Size() == 0) {
// No rhs nodes are given for this edge type. Create an empty graph.
rel_graphs.push_back(CreateFromCOO(
2, lhs_map.Size(), rhs_map.Size(),
aten::NullArray(), aten::NullArray()));
2, lhs_map.Size(), rhs_map.Size(), aten::NullArray(),
aten::NullArray()));
induced_edges.push_back(aten::NullArray());
} else {
IdArray new_src = lhs_map.Map(edge_arrays[etype].src, -1);
......@@ -117,22 +120,22 @@ ToBlockCPU(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
// Check whether there are unmapped IDs and raise error.
for (int64_t i = 0; i < new_dst->shape[0]; ++i)
CHECK_NE(new_dst.Ptr<IdType>()[i], -1)
<< "Node " << edge_arrays[etype].dst.Ptr<IdType>()[i] << " does not exist"
<< " in `rhs_nodes`. Argument `rhs_nodes` must contain all the edge"
<< " destination nodes.";
rel_graphs.push_back(CreateFromCOO(
2, lhs_map.Size(), rhs_map.Size(),
new_src, new_dst));
<< "Node " << edge_arrays[etype].dst.Ptr<IdType>()[i]
<< " does not exist"
<< " in `rhs_nodes`. Argument `rhs_nodes` must contain all the edge"
<< " destination nodes.";
rel_graphs.push_back(
CreateFromCOO(2, lhs_map.Size(), rhs_map.Size(), new_src, new_dst));
induced_edges.push_back(edge_arrays[etype].id);
}
}
const HeteroGraphPtr new_graph = CreateHeteroGraph(
new_meta_graph, rel_graphs, num_nodes_per_type);
const HeteroGraphPtr new_graph =
CreateHeteroGraph(new_meta_graph, rel_graphs, num_nodes_per_type);
if (generate_lhs_nodes) {
    CHECK_EQ(lhs_nodes.size(), 0) << "InternalError: lhs_nodes should be empty "
"when generating it.";
"when generating it.";
for (const IdHashMap<IdType> &lhs_map : lhs_node_mappings)
lhs_nodes.push_back(lhs_map.Values());
}
......@@ -141,87 +144,83 @@ ToBlockCPU(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
} // namespace
template<>
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
ToBlock<kDGLCPU, int32_t>(HeteroGraphPtr graph,
const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs,
std::vector<IdArray>* const lhs_nodes) {
template <>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlock<kDGLCPU, int32_t>(
HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray> *const lhs_nodes) {
return ToBlockCPU<int32_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}
template<>
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
ToBlock<kDGLCPU, int64_t>(HeteroGraphPtr graph,
const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs,
std::vector<IdArray>* const lhs_nodes) {
template <>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlock<kDGLCPU, int64_t>(
HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray> *const lhs_nodes) {
return ToBlockCPU<int64_t>(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}
#ifdef DGL_USE_CUDA
// Forward declaration of GPU ToBlock implementations - actual implementation is in
// Forward declaration of GPU ToBlock implementations - actual implementation is
// in
// ./cuda/cuda_to_block.cu
// This is to get around the broken name mangling in VS2019 CL 16.5.5 + CUDA 11.3
// which complains that the two template specializations have the same signature.
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
ToBlockGPU32(HeteroGraphPtr, const std::vector<IdArray>&, bool, std::vector<IdArray>* const);
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
ToBlockGPU64(HeteroGraphPtr, const std::vector<IdArray>&, bool, std::vector<IdArray>* const);
template<>
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
ToBlock<kDGLCUDA, int32_t>(HeteroGraphPtr graph,
const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs,
std::vector<IdArray>* const lhs_nodes) {
// This is to get around the broken name mangling in VS2019 CL 16.5.5 +
// CUDA 11.3 which complains that the two template specializations have the same
// signature.
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlockGPU32(
HeteroGraphPtr, const std::vector<IdArray> &, bool,
std::vector<IdArray> *const);
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlockGPU64(
HeteroGraphPtr, const std::vector<IdArray> &, bool,
std::vector<IdArray> *const);
template <>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlock<kDGLCUDA, int32_t>(
HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray> *const lhs_nodes) {
return ToBlockGPU32(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}
template<>
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
ToBlock<kDGLCUDA, int64_t>(HeteroGraphPtr graph,
const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs,
std::vector<IdArray>* const lhs_nodes) {
template <>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlock<kDGLCUDA, int64_t>(
HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray> *const lhs_nodes) {
return ToBlockGPU64(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes);
}
#endif // DGL_USE_CUDA
DGL_REGISTER_GLOBAL("transform._CAPI_DGLToBlock")
.set_body([] (DGLArgs args, DGLRetValue *rv) {
const HeteroGraphRef graph_ref = args[0];
const std::vector<IdArray> &rhs_nodes = ListValueToVector<IdArray>(args[1]);
const bool include_rhs_in_lhs = args[2];
std::vector<IdArray> lhs_nodes = ListValueToVector<IdArray>(args[3]);
HeteroGraphPtr new_graph;
std::vector<IdArray> induced_edges;
ATEN_XPU_SWITCH_CUDA(graph_ref->Context().device_type, XPU, "ToBlock", {
ATEN_ID_TYPE_SWITCH(graph_ref->DataType(), IdType, {
std::tie(new_graph, induced_edges) = ToBlock<XPU, IdType>(
graph_ref.sptr(), rhs_nodes, include_rhs_in_lhs,
&lhs_nodes);
.set_body([](DGLArgs args, DGLRetValue *rv) {
const HeteroGraphRef graph_ref = args[0];
const std::vector<IdArray> &rhs_nodes =
ListValueToVector<IdArray>(args[1]);
const bool include_rhs_in_lhs = args[2];
std::vector<IdArray> lhs_nodes = ListValueToVector<IdArray>(args[3]);
HeteroGraphPtr new_graph;
std::vector<IdArray> induced_edges;
ATEN_XPU_SWITCH_CUDA(graph_ref->Context().device_type, XPU, "ToBlock", {
ATEN_ID_TYPE_SWITCH(graph_ref->DataType(), IdType, {
std::tie(new_graph, induced_edges) = ToBlock<XPU, IdType>(
graph_ref.sptr(), rhs_nodes, include_rhs_in_lhs, &lhs_nodes);
});
});
});
List<Value> lhs_nodes_ref;
for (IdArray &array : lhs_nodes)
lhs_nodes_ref.push_back(Value(MakeValue(array)));
List<Value> induced_edges_ref;
for (IdArray &array : induced_edges)
induced_edges_ref.push_back(Value(MakeValue(array)));
List<Value> lhs_nodes_ref;
for (IdArray &array : lhs_nodes)
lhs_nodes_ref.push_back(Value(MakeValue(array)));
List<Value> induced_edges_ref;
for (IdArray &array : induced_edges)
induced_edges_ref.push_back(Value(MakeValue(array)));
List<ObjectRef> ret;
ret.push_back(HeteroGraphRef(new_graph));
ret.push_back(lhs_nodes_ref);
ret.push_back(induced_edges_ref);
List<ObjectRef> ret;
ret.push_back(HeteroGraphRef(new_graph));
ret.push_back(lhs_nodes_ref);
ret.push_back(induced_edges_ref);
*rv = ret;
});
*rv = ret;
});
}; // namespace transform
......
......@@ -44,10 +44,10 @@ namespace transform {
*
* @return The block and the induced edges.
*/
template<DGLDeviceType XPU, typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>>
ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray>* lhs_nodes);
template <DGLDeviceType XPU, typename IdType>
std::tuple<HeteroGraphPtr, std::vector<IdArray>> ToBlock(
HeteroGraphPtr graph, const std::vector<IdArray>& rhs_nodes,
bool include_rhs_in_lhs, std::vector<IdArray>* lhs_nodes);
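
Editor's note: a hedged toy walk-through of the ToBlock relabeling (plain STL, not part of this commit), inferred from the CPU implementation earlier in this diff: destination (rhs) nodes are numbered first, and with include_rhs_in_lhs the source side reuses those IDs before appending newly seen source nodes.

// [Editorial sketch, not part of this commit] Graph edges 5->7 and 9->7,
// rhs_nodes = {7}, include_rhs_in_lhs = true.
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<int64_t> src = {5, 9}, dst = {7, 7}, rhs_nodes = {7};
  std::unordered_map<int64_t, int64_t> rhs_map, lhs_map;
  for (int64_t v : rhs_nodes) rhs_map.emplace(v, rhs_map.size());
  lhs_map = rhs_map;  // include_rhs_in_lhs: dst nodes come first on the lhs
  for (int64_t v : src) lhs_map.emplace(v, lhs_map.size());
  for (size_t i = 0; i < src.size(); ++i)
    std::printf("edge %zu: %lld -> %lld\n", i,
                static_cast<long long>(lhs_map[src[i]]),
                static_cast<long long>(rhs_map[dst[i]]));
  // Prints "edge 0: 1 -> 0" and "edge 1: 2 -> 0"; the induced edge IDs point
  // back to the original edges.
  return 0;
}
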
} // namespace transform
} // namespace dgl
......