Unverified Commit b2d38ca8 authored by Hongzhi (Steve) Chen and committed by GitHub

[Misc] clang-format auto fix. (#4803)



* [Misc] clang-format auto fix.

* manual
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 07dc8fb6
......@@ -10,6 +10,7 @@
#include <dgl/graph_serializer.h>
#include <dmlc/io.h>
#include <dmlc/serializer.h>
#include <memory>
namespace dmlc {
......
/*!
* Copyright (c) 2020-2022 by Contributors
* \file array/tensordispatch.h
* \brief This file defines the dispatcher of tensor operators to framework-specific
* implementations.
* \brief This file defines the dispatcher of tensor operators to
* framework-specific implementations.
*
* The dispatcher consists of a TensorDispatcher singleton in DGL C library and
* one separately-built shared library per supported backend.
......@@ -15,14 +15,14 @@
* The TensorDispatcher singleton maintains a mapping from an array operator to
* the address of the corresponding symbol in the shared library. During
* initialization, the TensorDispatcher checks which backend DGL is using.
* It then locates and opens the corresponding shared library using dlopen(3) (or
* LoadLibrary in Windows), and populates the said mapping above with dlsym(3)
* (or GetProcAddress in Windows).
* It then locates and opens the corresponding shared library using dlopen(3)
* (or LoadLibrary in Windows), and populates the said mapping above with
* dlsym(3) (or GetProcAddress in Windows).
*
* A tensor operator in TensorDispatcher first checks whether the corresponding symbol
* address is found in the mapping. If so, it calls the function located at the
* symbol address instead, allocate/free pieces of memory on CPU/GPU.
* If not, it falls back to DeviceAPI::AllocWorkspace/FreeWorkspace.
* A tensor operator in TensorDispatcher first checks whether the corresponding
* symbol address is found in the mapping. If so, it calls the function located
* at that symbol address to allocate/free pieces of memory on CPU/GPU. If not,
* it falls back to DeviceAPI::AllocWorkspace/FreeWorkspace.
*/
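For reference, the dlopen/dlsym scheme described above can be sketched in a few self-contained lines; the library name, symbol name, and function signature below are placeholders for illustration, not DGL's actual adapter contract.

#include <dlfcn.h>

#include <cstddef>
#include <cstdio>

int main() {
  // Open a backend-specific adapter library (the name is a placeholder).
  void* handle = dlopen("libtensoradapter_backend.so", RTLD_LAZY | RTLD_LOCAL);
  if (!handle) {
    std::printf("adapter unavailable; would fall back to DeviceAPI\n");
    return 0;
  }
  // Resolve one entry point and cast it to its assumed signature.
  using AllocFn = void* (*)(std::size_t);
  auto alloc = reinterpret_cast<AllocFn>(dlsym(handle, "CPURawAlloc"));
  if (alloc) {
    void* buffer = alloc(1024);  // delegate allocation to the backend
    (void)buffer;
  }
  dlclose(handle);
  return 0;
}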
#ifndef DGL_RUNTIME_TENSORDISPATCH_H_
......@@ -38,14 +38,18 @@
#endif // DGL_USE_CUDA
#include "ndarray.h"
/*! \brief Casts a pointer \c entry to a function pointer with signature of \c func */
#define FUNCCAST(func, entry) (*reinterpret_cast<decltype(&(func))>(entry))
/*!
* \brief Casts a pointer \c entry to a function pointer with signature of \c
* func.
*/
#define FUNCCAST(func, entry) (*reinterpret_cast<decltype(&(func))>(entry))
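To make the macro concrete: assuming a hypothetical adapter function with signature void* CPURawAlloc(size_t), a call through FUNCCAST expands roughly as shown below, where entry is the raw void* obtained from dlsym.

// FUNCCAST(tensoradapter::CPURawAlloc, entry)(nbytes)
//   ==> (*reinterpret_cast<void* (*)(size_t)>(entry))(nbytes)
// i.e. the stored symbol address is reinterpreted as a pointer to a function
// with CPURawAlloc's signature and invoked directly.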
namespace dgl {
namespace runtime {
/*!
* \brief Dispatcher that delegates the function calls to framework-specific C++ APIs.
* \brief Dispatcher that delegates the function calls to framework-specific C++
* APIs.
*
* This class is not thread-safe.
*/
......@@ -57,17 +61,14 @@ class TensorDispatcher {
return &inst;
}
/*! \brief Whether an adapter library is available */
inline bool IsAvailable() {
return available_;
}
/*! \brief Whether an adapter library is available. */
inline bool IsAvailable() { return available_; }
/*! \brief Load symbols from the given tensor adapter library path */
bool Load(const char *path_cstr);
/*! \brief Load symbols from the given tensor adapter library path. */
bool Load(const char* path_cstr);
/*!
* \brief Allocate a piece of CPU memory via
* PyTorch's CPUAllocator.
* \brief Allocate a piece of CPU memory via PyTorch's CPUAllocator.
* Used in CPUDeviceAPI::AllocWorkspace().
*
* \param nbytes The size to be allocated.
......@@ -94,7 +95,7 @@ class TensorDispatcher {
* \brief Allocate a piece of GPU memory via
* PyTorch's THCCachingAllocator.
* Used in CUDADeviceAPI::AllocWorkspace().
*
*
* \note THCCachingAllocator specifies the device to allocate on
* via cudaGetDevice(). Make sure to call cudaSetDevice()
* before invoking this function.
......@@ -120,15 +121,15 @@ class TensorDispatcher {
}
/*!
* \brief Find the current PyTorch CUDA stream
* Used in runtime::getCurrentCUDAStream().
*
* \note PyTorch pre-allocates/sets the current CUDA stream
* on current device via cudaGetDevice(). Make sure to call cudaSetDevice()
* before invoking this function.
*
* \return cudaStream_t stream handle
*/
* \brief Find the current PyTorch CUDA stream
* Used in runtime::getCurrentCUDAStream().
*
* \note PyTorch pre-allocates/sets the current CUDA stream
* on current device via cudaGetDevice(). Make sure to call cudaSetDevice()
* before invoking this function.
*
* \return cudaStream_t stream handle
*/
inline cudaStream_t CUDAGetCurrentStream() {
auto entry = entrypoints_[Op::kCUDACurrentStream];
return FUNCCAST(tensoradapter::CUDACurrentStream, entry)();
......@@ -146,8 +147,8 @@ class TensorDispatcher {
inline void RecordStream(void* ptr, DGLStreamHandle stream, int device_id) {
#ifdef DGL_USE_CUDA
auto entry = entrypoints_[Op::kRecordStream];
FUNCCAST(tensoradapter::RecordStream, entry)(
ptr, static_cast<cudaStream_t>(stream), device_id);
FUNCCAST(tensoradapter::RecordStream, entry)
(ptr, static_cast<cudaStream_t>(stream), device_id);
#endif // DGL_USE_CUDA
}
......@@ -162,14 +163,10 @@ class TensorDispatcher {
*
* Must match the functions in tensoradapter/include/tensoradapter.h.
*/
static constexpr const char *names_[] = {
"CPURawAlloc",
"CPURawDelete",
static constexpr const char* names_[] = {
"CPURawAlloc", "CPURawDelete",
#ifdef DGL_USE_CUDA
"CUDARawAlloc",
"CUDARawDelete",
"CUDACurrentStream",
"RecordStream",
"CUDARawAlloc", "CUDARawDelete", "CUDACurrentStream", "RecordStream",
#endif // DGL_USE_CUDA
};
......@@ -191,13 +188,9 @@ class TensorDispatcher {
/*! \brief Entrypoints of each function */
void* entrypoints_[num_entries_] = {
nullptr,
nullptr,
nullptr, nullptr,
#ifdef DGL_USE_CUDA
nullptr,
nullptr,
nullptr,
nullptr,
nullptr, nullptr, nullptr, nullptr,
#endif // DGL_USE_CUDA
};
......
......@@ -22,26 +22,26 @@ class ThreadGroup {
public:
class Impl;
/*!
* \brief Creates a collection of threads which run a provided function.
*
* \param num_workers The total number of worker threads in this group.
Includes main thread if `exclude_worker0 = true`
* \param worker_callback A callback which is run in its own thread.
Receives the worker_id as an argument.
* \param exclude_worker0 Whether to use the main thread as a worker.
* If `true`, worker0 will not be launched in a new thread and
* `worker_callback` will only be called for values >= 1. This
* allows use of the main thread as a worker.
*/
ThreadGroup(int num_workers,
std::function<void(int)> worker_callback,
bool exclude_worker0 = false);
/*!
* \brief Creates a collection of threads which run a provided function.
*
* \param num_workers The total number of worker threads in this group.
Includes main thread if `exclude_worker0 = true`
* \param worker_callback A callback which is run in its own thread.
Receives the worker_id as an argument.
* \param exclude_worker0 Whether to use the main thread as a worker.
* If `true`, worker0 will not be launched in a new thread and
* `worker_callback` will only be called for values >= 1. This
* allows use of the main thread as a worker.
*/
ThreadGroup(
int num_workers, std::function<void(int)> worker_callback,
bool exclude_worker0 = false);
~ThreadGroup();
/*!
* \brief Blocks until all non-main threads in the pool finish.
*/
/*!
* \brief Blocks until all non-main threads in the pool finish.
*/
void Join();
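The constructor contract above (spawn num_workers workers, optionally keeping worker 0 on the main thread) can be illustrated with a minimal standard-library sketch; this is not DGL's implementation and omits affinity handling.

#include <functional>
#include <thread>
#include <vector>

class SimpleThreadGroup {
 public:
  // Spawn workers [start, num_workers); when exclude_worker0 is true, worker 0
  // is left for the caller to run on the main thread.
  SimpleThreadGroup(
      int num_workers, std::function<void(int)> worker_callback,
      bool exclude_worker0 = false)
      : callback_(std::move(worker_callback)) {
    for (int i = exclude_worker0 ? 1 : 0; i < num_workers; ++i)
      threads_.emplace_back(callback_, i);
  }
  // Blocks until all non-main threads finish.
  void Join() {
    for (auto& t : threads_) t.join();
  }

 private:
  std::function<void(int)> callback_;
  std::vector<std::thread> threads_;
};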
enum AffinityMode : int {
......@@ -70,8 +70,8 @@ class ThreadGroup {
/*!
* \brief Platform-agnostic no-op.
*/
// This used to be Yield(), renaming to YieldThread() because windows.h defined it as a
// macro in later SDKs.
// This used to be Yield(), renaming to YieldThread() because windows.h defined
// it as a macro in later SDKs.
void YieldThread();
/*!
......@@ -79,7 +79,6 @@ void YieldThread();
*/
int MaxConcurrency();
} // namespace threading
} // namespace runtime
} // namespace dgl
......
......@@ -6,10 +6,11 @@
#ifndef DGL_SAMPLER_H_
#define DGL_SAMPLER_H_
#include <vector>
#include <string>
#include <cstdlib>
#include <ctime>
#include <string>
#include <vector>
#include "graph_interface.h"
#include "nodeflow.h"
......@@ -32,13 +33,11 @@ class SamplerOp {
* \param probability the transition probability (float/double).
* \return a NodeFlow graph.
*/
template<typename ValueType>
static NodeFlow NeighborSample(const ImmutableGraph *graph,
const std::vector<dgl_id_t>& seeds,
const std::string &edge_type,
int num_hops, int expand_factor,
const bool add_self_loop,
const ValueType *probability);
template <typename ValueType>
static NodeFlow NeighborSample(
const ImmutableGraph *graph, const std::vector<dgl_id_t> &seeds,
const std::string &edge_type, int num_hops, int expand_factor,
const bool add_self_loop, const ValueType *probability);
/*!
* \brief Sample a graph from the seed vertices with layer sampling.
......@@ -50,10 +49,9 @@ class SamplerOp {
* \param layer_sizes The size of layers.
* \return a NodeFlow graph.
*/
static NodeFlow LayerUniformSample(const ImmutableGraph *graph,
const std::vector<dgl_id_t>& seeds,
const std::string &neigh_type,
IdArray layer_sizes);
static NodeFlow LayerUniformSample(
const ImmutableGraph *graph, const std::vector<dgl_id_t> &seeds,
const std::string &neigh_type, IdArray layer_sizes);
};
} // namespace dgl
......
......@@ -6,40 +6,37 @@
#ifndef DGL_SAMPLING_NEGATIVE_H_
#define DGL_SAMPLING_NEGATIVE_H_
#include <dgl/base_heterograph.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <utility>
namespace dgl {
namespace sampling {
/*!
* \brief Given an edge type, uniformly sample source-destination pairs that do not have
* an edge in between using rejection sampling.
* \brief Given an edge type, uniformly sample source-destination pairs that do
* not have an edge in between using rejection sampling.
*
* \note This function may not return the same number of elements as the given number
* of samples.
* \note This function requires sorting the CSR or CSC matrix of the graph in-place. It
* prefers CSC over CSR.
* \note This function may not return the same number of elements as the given
* number of samples.
* \note This function requires sorting the CSR or CSC matrix of the graph
* in-place. It prefers CSC over CSR.
*
* \param hg The graph.
* \param etype The edge type.
* \param num_samples The number of negative examples to sample.
* \param num_trials The number of rejection sampling trials.
* \param exclude_self_loops Do not include the examples where the source equals the
* destination.
* \param exclude_self_loops Do not include the examples where the source equals
* the destination.
* \param replace Whether to sample with replacement.
* \param redundancy How much redundant negative examples to take in case of duplicate examples.
* \param redundancy How many redundant negative examples to take in case of
* duplicate examples.
* \return The pair of source and destination tensors.
*/
std::pair<IdArray, IdArray> GlobalUniformNegativeSampling(
HeteroGraphPtr hg,
dgl_type_t etype,
int64_t num_samples,
int num_trials,
bool exclude_self_loops,
bool replace,
double redundancy);
HeteroGraphPtr hg, dgl_type_t etype, int64_t num_samples, int num_trials,
bool exclude_self_loops, bool replace, double redundancy);
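To illustrate the rejection-sampling contract above, including why fewer than num_samples pairs may be returned, here is a self-contained sketch; has_edge is a hypothetical predicate standing in for the sorted CSC/CSR lookup.

#include <cstdint>
#include <functional>
#include <random>
#include <utility>
#include <vector>

std::pair<std::vector<int64_t>, std::vector<int64_t>> NegativeSampleSketch(
    int64_t num_src, int64_t num_dst, int64_t num_samples, int num_trials,
    bool exclude_self_loops,
    const std::function<bool(int64_t, int64_t)>& has_edge) {
  std::mt19937_64 rng(42);
  std::uniform_int_distribution<int64_t> src_dist(0, num_src - 1);
  std::uniform_int_distribution<int64_t> dst_dist(0, num_dst - 1);
  std::vector<int64_t> src, dst;
  for (int64_t i = 0; i < num_samples; ++i) {
    for (int t = 0; t < num_trials; ++t) {
      int64_t u = src_dist(rng), v = dst_dist(rng);
      if (exclude_self_loops && u == v) continue;  // reject self loops
      if (has_edge(u, v)) continue;                // reject existing edges
      src.push_back(u);
      dst.push_back(v);
      break;
    }
    // If every trial was rejected, no pair is emitted for this sample, which
    // is why the result may contain fewer than num_samples pairs.
  }
  return {std::move(src), std::move(dst)};
}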
}; // namespace sampling
}; // namespace dgl
......
......@@ -6,81 +6,75 @@
#ifndef DGL_SAMPLING_NEIGHBOR_H_
#define DGL_SAMPLING_NEIGHBOR_H_
#include <dgl/base_heterograph.h>
#include <dgl/array.h>
#include <dgl/base_heterograph.h>
#include <vector>
namespace dgl {
namespace sampling {
/*!
* \brief Sample from the neighbors of the given nodes and return the sampled edges as a graph.
* \brief Sample from the neighbors of the given nodes and return the sampled
* edges as a graph.
*
* When sampling with replacement, the sampled subgraph could have parallel edges.
* When sampling with replacement, the sampled subgraph could have parallel
* edges.
*
* For sampling without replacement, if fanout > the number of neighbors, all the
* neighbors will be sampled.
*
* \param hg The input graph.
* \param nodes Node IDs of each type. The vector length must be equal to the number
* of node types. Empty array is allowed.
* \param fanouts Number of sampled neighbors for each edge type. The vector length
* should be equal to the number of edge types, or one if they all
* have the same fanout.
* \param nodes Node IDs of each type. The vector length must be equal to the
* number of node types. Empty array is allowed.
* \param fanouts Number of sampled neighbors for each edge type. The vector
* length should be equal to the number of edge types, or one if they all have
* the same fanout.
* \param dir Edge direction.
* \param probability A vector of 1D float arrays, indicating the transition probability of
* each edge by edge type. An empty float array assumes uniform transition.
* \param exclude_edges Edges IDs of each type which will be excluded during sampling.
* The vector length must be equal to the number of edges types. Empty array is allowed.
* \param probability A vector of 1D float arrays, indicating the transition
* probability of each edge by edge type. An empty float array assumes uniform
* transition.
* \param exclude_edges Edge IDs of each type which will be excluded during
* sampling. The vector length must be equal to the number of edge types. Empty
* array is allowed.
* \param replace If true, sample with replacement.
* \return Sampled neighborhoods as a graph. The return graph has the same schema as the
* original one.
* \return Sampled neighborhoods as a graph. The return graph has the same
* schema as the original one.
*/
HeteroSubgraph SampleNeighbors(
const HeteroGraphPtr hg,
const std::vector<IdArray>& nodes,
const std::vector<int64_t>& fanouts,
EdgeDir dir,
const HeteroGraphPtr hg, const std::vector<IdArray>& nodes,
const std::vector<int64_t>& fanouts, EdgeDir dir,
const std::vector<FloatArray>& probability,
const std::vector<IdArray>& exclude_edges,
bool replace = true);
const std::vector<IdArray>& exclude_edges, bool replace = true);
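The fanout rule above (sampling without replacement, taking every neighbor when fanout is at least the degree) is easy to see in a reduced sketch over a plain adjacency list; nothing below is a DGL API.

#include <algorithm>
#include <cstdint>
#include <random>
#include <vector>

// Sample up to `fanout` distinct neighbors of `node`; when fanout >= degree,
// all neighbors are returned, matching the documented behaviour.
std::vector<int64_t> UniformNeighborSketch(
    const std::vector<std::vector<int64_t>>& adjacency, int64_t node,
    int64_t fanout, std::mt19937_64& rng) {
  std::vector<int64_t> neighbors = adjacency[node];
  if (fanout >= static_cast<int64_t>(neighbors.size())) return neighbors;
  std::shuffle(neighbors.begin(), neighbors.end(), rng);
  neighbors.resize(fanout);  // keep an arbitrary subset of size `fanout`
  return neighbors;
}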
/*!
* Select the neighbors with k-largest weights on the connecting edges for each given node.
* Select the neighbors with k-largest weights on the connecting edges for each
* given node.
*
* If k > the number of neighbors, all the neighbors are sampled.
*
* \param hg The input graph.
* \param nodes Node IDs of each type. The vector length must be equal to the number
* of node types. Empty array is allowed.
* \param k The k value for each edge type. The vector length
* should be equal to the number of edge types, or one if they all
* have the same fanout.
* \param nodes Node IDs of each type. The vector length must be equal to the
* number of node types. Empty array is allowed.
* \param k The k value for each edge type. The vector length should be equal to
* the number of edge types, or one if they all have the same fanout.
* \param dir Edge direction.
* \param weight A vector of 1D float arrays, indicating the weights associated with
* each edge.
* \param ascending If true, elements are sorted by ascending order, equivalent to find
* the K smallest values. Otherwise, find K largest values.
* \return Sampled neighborhoods as a graph. The return graph has the same schema as the
* original one.
* \param weight A vector of 1D float arrays, indicating the weights associated
* with each edge.
* \param ascending If true, elements are sorted in ascending order, equivalent
* to finding the K smallest values. Otherwise, find the K largest values.
* \return Sampled neighborhoods as a graph. The return graph has the same
* schema as the original one.
*/
HeteroSubgraph SampleNeighborsTopk(
const HeteroGraphPtr hg,
const std::vector<IdArray>& nodes,
const std::vector<int64_t>& k,
EdgeDir dir,
const std::vector<FloatArray>& weight,
bool ascending = false);
const HeteroGraphPtr hg, const std::vector<IdArray>& nodes,
const std::vector<int64_t>& k, EdgeDir dir,
const std::vector<FloatArray>& weight, bool ascending = false);
HeteroSubgraph SampleNeighborsBiased(
const HeteroGraphPtr hg,
const IdArray& nodes,
const int64_t fanouts,
const NDArray& bias,
const NDArray& tag_offset,
const EdgeDir dir,
const bool replace
);
const HeteroGraphPtr hg, const IdArray& nodes, const int64_t fanouts,
const NDArray& bias, const NDArray& tag_offset, const EdgeDir dir,
const bool replace);
} // namespace sampling
} // namespace dgl
......
......@@ -6,11 +6,12 @@
#ifndef DGL_SAMPLING_RANDOMWALKS_H_
#define DGL_SAMPLING_RANDOMWALKS_H_
#include <dgl/base_heterograph.h>
#include <dgl/array.h>
#include <vector>
#include <utility>
#include <dgl/base_heterograph.h>
#include <tuple>
#include <utility>
#include <vector>
namespace dgl {
......@@ -19,71 +20,67 @@ namespace sampling {
/*!
* \brief Metapath-based random walk.
* \param hg The heterograph.
* \param seeds A 1D array of seed nodes, with the type the source type of the first
* edge type in the metapath.
* \param seeds A 1D array of seed nodes, with the type the source type of the
* first edge type in the metapath.
* \param metapath A 1D array of edge types representing the metapath.
* \param prob A vector of 1D float arrays, indicating the transition probability of
* each edge by edge type. An empty float array assumes uniform transition.
* \param prob A vector of 1D float arrays, indicating the transition
* probability of each edge by edge type. An empty float array assumes uniform
* transition.
* \return A pair of
* 1. One 2D array of shape (len(seeds), len(metapath) + 1) with node IDs. The
* paths that terminated early are padded with -1.
* 2. One 2D array of shape (len(seeds), len(metapath)) with edge IDs. The
* paths that terminated early are padded with -1.
* 1. One 2D array of shape (len(seeds), len(metapath) + 1) with node
* IDs. The paths that terminated early are padded with -1.
* 2. One 2D array of shape (len(seeds), len(metapath)) with edge IDs.
* The paths that terminated early are padded with -1.
* 3. One 1D array of shape (len(metapath) + 1) with node type IDs.
*/
std::tuple<IdArray, IdArray, TypeArray> RandomWalk(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
const std::vector<FloatArray> &prob);
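To make the -1 padding convention concrete, here is a sketch of a single metapath-guided walk; neighbors(step, node) is a hypothetical callback returning the successors of node under edge type metapath[step].

#include <cstddef>
#include <cstdint>
#include <functional>
#include <random>
#include <vector>

std::vector<int64_t> MetapathWalkSketch(
    int64_t seed, std::size_t metapath_len,
    const std::function<std::vector<int64_t>(std::size_t, int64_t)>& neighbors,
    std::mt19937_64& rng) {
  // One row of the (len(seeds), len(metapath) + 1) node-ID output; entries
  // after an early termination keep the -1 padding.
  std::vector<int64_t> trace(metapath_len + 1, -1);
  trace[0] = seed;
  int64_t current = seed;
  for (std::size_t step = 0; step < metapath_len; ++step) {
    std::vector<int64_t> successors = neighbors(step, current);
    if (successors.empty()) break;  // walk terminates early
    std::uniform_int_distribution<std::size_t> pick(0, successors.size() - 1);
    current = successors[pick(rng)];
    trace[step + 1] = current;
  }
  return trace;
}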
/*!
* \brief Metapath-based random walk with restart probability.
* \param hg The heterograph.
* \param seeds A 1D array of seed nodes, with the type the source type of the first
* edge type in the metapath.
* \param seeds A 1D array of seed nodes, with the type the source type of the
* first edge type in the metapath.
* \param metapath A 1D array of edge types representing the metapath.
* \param prob A vector of 1D float arrays, indicating the transition probability of
* each edge by edge type. An empty float array assumes uniform transition.
* \param restart_prob Restart probability
* \param prob A vector of 1D float arrays, indicating the transition
* probability of each edge by edge type. An empty float array assumes uniform
* transition.
* \param restart_prob Restart probability.
* \return A pair of
* 1. One 2D array of shape (len(seeds), len(metapath) + 1) with node IDs. The
* paths that terminated early are padded with -1.
* 2. One 2D array of shape (len(seeds), len(metapath)) with edge IDs. The
* paths that terminated early are padded with -1.
* 1. One 2D array of shape (len(seeds), len(metapath) + 1) with node
* IDs. The paths that terminated early are padded with -1.
* 2. One 2D array of shape (len(seeds), len(metapath)) with edge IDs.
* The paths that terminated early are padded with -1.
* 3. One 1D array of shape (len(metapath) + 1) with node type IDs.
*/
std::tuple<IdArray, IdArray, TypeArray> RandomWalkWithRestart(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
double restart_prob);
const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
const std::vector<FloatArray> &prob, double restart_prob);
/*!
* \brief Metapath-based random walk with stepwise restart probability. Useful
* \brief Metapath-based random walk with stepwise restart probability. Useful
* for PinSAGE-like models.
* \param hg The heterograph.
* \param seeds A 1D array of seed nodes, with the type the source type of the first
* edge type in the metapath.
* \param seeds A 1D array of seed nodes, with the type the source type of the
* first edge type in the metapath.
* \param metapath A 1D array of edge types representing the metapath.
* \param prob A vector of 1D float arrays, indicating the transition probability of
* each edge by edge type. An empty float array assumes uniform transition.
* \param restart_prob Restart probability array which has the same number of elements
* as \c metapath, indicating the probability to terminate after transition.
* \param prob A vector of 1D float arrays, indicating the transition
* probability of each edge by edge type. An empty float array assumes uniform
* transition.
* \param restart_prob Restart probability array which has the same number of
* elements as \c metapath, indicating the probability to terminate after
* transition.
* \return A pair of
* 1. One 2D array of shape (len(seeds), len(metapath) + 1) with node IDs. The
* paths that terminated early are padded with -1.
* 2. One 2D array of shape (len(seeds), len(metapath)) with edge IDs. The
* paths that terminated early are padded with -1.
* 1. One 2D array of shape (len(seeds), len(metapath) + 1) with node
* IDs. The paths that terminated early are padded with -1.
* 2. One 2D array of shape (len(seeds), len(metapath)) with edge IDs.
* The paths that terminated early are padded with -1.
* 3. One 1D array of shape (len(metapath) + 1) with node type IDs.
*/
std::tuple<IdArray, IdArray, TypeArray> RandomWalkWithStepwiseRestart(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
FloatArray restart_prob);
const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath,
const std::vector<FloatArray> &prob, FloatArray restart_prob);
}; // namespace sampling
......
......@@ -7,6 +7,7 @@
#define DGL_SCHEDULER_H_
#include <vector>
#include "runtime/ndarray.h"
namespace dgl {
......@@ -21,8 +22,8 @@ namespace sched {
* \param msg_ids The edge id for each message
* \param vids The destination vertex for each message
* \param recv_ids The recv nodes (for checking zero degree nodes)
* \note If there are multiple messages going into the same destination vertex, then
* there will be multiple copies of the destination vertex in vids
* \note If there are multiple messages going into the same destination vertex,
* then there will be multiple copies of the destination vertex in vids.
* \return a vector of 5 IdArrays for degree bucketing. The 5 arrays are:
* degrees: degrees for each bucket
* nids: destination node ids
......@@ -31,8 +32,8 @@ namespace sched {
* mid_section: number of messages in each bucket (used to split mids)
*/
template <class IdType>
std::vector<IdArray> DegreeBucketing(const IdArray& msg_ids, const IdArray& vids,
const IdArray& recv_ids);
std::vector<IdArray> DegreeBucketing(
const IdArray& msg_ids, const IdArray& vids, const IdArray& recv_ids);
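A reduced sketch of the degree-bucketing idea: destination vertices are grouped by how many messages they receive so that each bucket can be processed with one batched apply call. The types and return layout here are simplified relative to the five IdArrays described above.

#include <cstdint>
#include <map>
#include <vector>

// Map each in-degree to the destination vertices with that degree.
std::map<int64_t, std::vector<int64_t>> DegreeBucketSketch(
    const std::vector<int64_t>& vids) {
  std::map<int64_t, int64_t> in_degree;  // destination -> message count
  for (int64_t v : vids) ++in_degree[v];
  std::map<int64_t, std::vector<int64_t>> buckets;  // degree -> destinations
  for (const auto& kv : in_degree) buckets[kv.second].push_back(kv.first);
  return buckets;
}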
/*!
* \brief Generate degree bucketing schedule for group_apply edge
......@@ -53,8 +54,8 @@ std::vector<IdArray> DegreeBucketing(const IdArray& msg_ids, const IdArray& vids
* new_uids, new_vids, and new_eids)
*/
template <class IdType>
std::vector<IdArray> GroupEdgeByNodeDegree(const IdArray& uids,
const IdArray& vids, const IdArray& eids);
std::vector<IdArray> GroupEdgeByNodeDegree(
const IdArray& uids, const IdArray& vids, const IdArray& eids);
} // namespace sched
......
......@@ -7,50 +7,51 @@
#ifndef DGL_TRANSFORM_H_
#define DGL_TRANSFORM_H_
#include <vector>
#include <tuple>
#include <utility>
#include "base_heterograph.h"
#include <vector>
#include "array.h"
#include "base_heterograph.h"
namespace dgl {
namespace transform {
/*!
* \brief Given a list of graphs, remove the common nodes that do not have inbound and
* outbound edges.
* \brief Given a list of graphs, remove the common nodes that do not have
* inbound and outbound edges.
*
* The graphs should have identical node ID space (i.e. should have the same set of nodes,
* including types and IDs).
* The graphs should have identical node ID space (i.e. should have the same set
* of nodes, including types and IDs).
*
* \param graphs The list of graphs.
* \param always_preserve The list of nodes to preserve regardless of whether the inbound
* or outbound edges exist.
* \param always_preserve The list of nodes to preserve regardless of whether
* the inbound or outbound edges exist.
*
* \return A pair. The first element is the list of compacted graphs, and the second
* element is the mapping from the compacted graphs and the original graph.
* \return A pair. The first element is the list of compacted graphs, and the
* second element is the mapping from the compacted graphs to the original
* graph.
*/
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> CompactGraphs(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve);
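For intuition, a sketch of the compaction idea for one homogeneous graph: keep only nodes that appear on an edge (plus always_preserve), relabel them densely, and record the new-to-old mapping. DGL's version does this jointly across several heterogeneous graphs sharing one node space; the names below are illustrative only.

#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

struct CompactResultSketch {
  std::vector<std::pair<int64_t, int64_t>> edges;  // relabeled edges
  std::vector<int64_t> induced_nodes;              // new ID -> original ID
};

CompactResultSketch CompactSketch(
    const std::vector<std::pair<int64_t, int64_t>>& edges,
    const std::vector<int64_t>& always_preserve) {
  std::unordered_map<int64_t, int64_t> remap;
  std::vector<int64_t> induced;
  auto relabel = [&](int64_t old_id) -> int64_t {
    auto it = remap.find(old_id);
    if (it != remap.end()) return it->second;
    int64_t new_id = static_cast<int64_t>(induced.size());
    remap.emplace(old_id, new_id);
    induced.push_back(old_id);
    return new_id;
  };
  CompactResultSketch result;
  for (int64_t v : always_preserve) relabel(v);  // kept regardless of edges
  for (const auto& e : edges)
    result.edges.emplace_back(relabel(e.first), relabel(e.second));
  result.induced_nodes = std::move(induced);
  return result;
}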
/*!
* \brief Convert a graph into a bipartite-structured graph for message passing.
*
* Specifically, we create one node type \c ntype_l on the "left" side and another
* node type \c ntype_r on the "right" side for each node type \c ntype. The nodes of
* type \c ntype_r would contain the nodes designated by the caller, and node type
* \c ntype_l would contain the nodes that has an edge connecting to one of the
* designated nodes.
* Specifically, we create one node type \c ntype_l on the "left" side and
* another node type \c ntype_r on the "right" side for each node type \c ntype.
* The nodes of type \c ntype_r would contain the nodes designated by the
* caller, and node type \c ntype_l would contain the nodes that have an edge
* connecting to one of the designated nodes.
*
* The nodes of \c ntype_l would also contain the nodes in node type \c ntype_r.
*
* This function is often used for constructing a series of dependency graphs for
* multi-layer message passing, where we first construct a series of frontier graphs
* on the original node space, and run the following to get the bipartite graph needed
* for message passing with each GNN layer:
* This function is often used for constructing a series of dependency graphs
* for multi-layer message passing, where we first construct a series of
* frontier graphs on the original node space, and run the following to get the
* bipartite graph needed for message passing with each GNN layer:
*
* <code>
* bipartites = [None] * len(num_layers)
......@@ -66,20 +67,21 @@ CompactGraphs(
*
* \param graph The graph.
* \param rhs_nodes Designated nodes that would appear on the right side.
* \param include_rhs_in_lhs If false, do not include the nodes of node type \c ntype_r
* in \c ntype_l.
* \param include_rhs_in_lhs If false, do not include the nodes of node type \c
* ntype_r in \c ntype_l.
*
* \return A triplet containing
* * The bipartite-structured graph,
* * The induced node from the left side for each graph,
* * The induced edges.
*
* \note If include_rhs_in_lhs is true, then for each node type \c ntype, the nodes
* in rhs_nodes[ntype] would always appear first in the nodes of type \c ntype_l
* in the new graph.
* \note If include_rhs_in_lhs is true, then for each node type \c ntype, the
* nodes in rhs_nodes[ntype] would always appear first in the nodes of type \c
* ntype_l in the new graph.
*/
std::tuple<HeteroGraphPtr, std::vector<IdArray>, std::vector<IdArray>>
ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes, bool include_rhs_in_lhs);
std::tuple<HeteroGraphPtr, std::vector<IdArray>, std::vector<IdArray>> ToBlock(
HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes,
bool include_rhs_in_lhs);
/*!
* \brief Convert a multigraph to a simple graph.
......@@ -87,7 +89,8 @@ ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes, bool includ
* \return A triplet of
* * \c hg : The said simple graph.
* * \c count : The array of edge occurrences per edge type.
* * \c edge_map : The mapping from original edge IDs to new edge IDs per edge type.
* * \c edge_map : The mapping from original edge IDs to new edge IDs per edge
* type.
*
* \note Example: consider a graph with the following edges
*
......@@ -99,13 +102,14 @@ ToBlock(HeteroGraphPtr graph, const std::vector<IdArray> &rhs_nodes, bool includ
*
* [(0, 1), (1, 3), (1, 4), (2, 2)]
*
* * The second element is an array \c count. \c count[i] stands for the number of edges
* connecting simple_g.src[i] and simple_g.dst[i] in the original graph.
* * The second element is an array \c count. \c count[i] stands for the number
* of edges connecting simple_g.src[i] and simple_g.dst[i] in the original
* graph.
*
* count[0] = [1, 2, 2, 1]
*
* * One can find the mapping between edges from the original graph to the new simple
* graph.
* * One can find the mapping between edges from the original graph to the new
* simple graph.
*
* edge_map[0] = [0, 1, 3, 1, 2, 2]
*/
......@@ -118,11 +122,11 @@ ToSimpleGraph(const HeteroGraphPtr graph);
* \param graph The graph.
* \param eids The edge IDs to remove per edge type.
*
* \return A pair of the graph with edges removed, as well as the edge ID mapping from
* the original graph to the new graph per edge type.
* \return A pair of the graph with edges removed, as well as the edge ID
* mapping from the original graph to the new graph per edge type.
*/
std::pair<HeteroGraphPtr, std::vector<IdArray>>
RemoveEdges(const HeteroGraphPtr graph, const std::vector<IdArray> &eids);
std::pair<HeteroGraphPtr, std::vector<IdArray>> RemoveEdges(
const HeteroGraphPtr graph, const std::vector<IdArray> &eids);
}; // namespace transform
......
......@@ -12,12 +12,12 @@
#include <dmlc/serializer.h>
#include <deque>
#include <memory>
#include <queue>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include <memory>
#include "dmlc/logging.h"
......@@ -85,8 +85,9 @@ class StreamWithBuffer : public dmlc::SeekStream {
* // Read from the pointer list sent by the remote side
* StreamWithBuffer buf_strm(&blob, data_ptr_list)
*/
StreamWithBuffer(std::unique_ptr<dmlc::SeekStream> strm,
const std::vector<void*>& data_ptr_list)
StreamWithBuffer(
std::unique_ptr<dmlc::SeekStream> strm,
const std::vector<void*>& data_ptr_list)
: strm_(std::move(strm)), send_to_remote_(true) {
for (void* data : data_ptr_list) {
buffer_list_.emplace_back(data);
......@@ -136,8 +137,8 @@ class StreamWithBuffer : public dmlc::SeekStream {
* \param size buffer size
* \param data_ptr_list pointer list for NDArrays to deconstruct from
*/
StreamWithBuffer(char* p_buffer, size_t size,
const std::vector<void*>& data_ptr_list)
StreamWithBuffer(
char* p_buffer, size_t size, const std::vector<void*>& data_ptr_list)
: strm_(new dmlc::MemoryFixedSizeStream(p_buffer, size)),
send_to_remote_(true) {
for (void* data : data_ptr_list) {
......
......@@ -9,6 +9,7 @@
#include <memory>
#include <tuple>
#include <type_traits>
#include "dmlc/logging.h"
#include "meta_utils.h"
#include "xbyak/xbyak.h"
......@@ -61,10 +62,10 @@ class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
public:
typedef typename Op::type DType;
static_assert(
std::is_base_of<std::true_type,
utils::has_type<DType, supported_types>>::value,
"Use case fail dgl::ElemWiseAddUpdate< Operator<DType> > DType is not "
"supported !");
std::is_base_of<
std::true_type, utils::has_type<DType, supported_types>>::value,
"Use case fail dgl::ElemWiseAddUpdate< Operator<DType> > DType is not "
"supported !");
protected:
const Xbyak::Reg64 &r_out_;
......@@ -80,77 +81,86 @@ class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
static constexpr int BITS_IN_BYTES = 8;
static constexpr int REG_BIT_SIZE = 512;
static constexpr int UNIT_PER_REG =
REG_BIT_SIZE / (UNIT_SIZE_BYTES * BITS_IN_BYTES);
REG_BIT_SIZE / (UNIT_SIZE_BYTES * BITS_IN_BYTES);
template <class TType, class R1, class R2,
utils::CheckCmp<TType, float> = true>
template <
class TType, class R1, class R2, utils::CheckCmp<TType, float> = true>
void alias_load(R1 r1, R2 r2) {
vmovups(r1, r2);
}
template <class TType, class R1, class R2,
utils::CheckCmp<TType, double> = true>
template <
class TType, class R1, class R2, utils::CheckCmp<TType, double> = true>
void alias_load(R1 r1, R2 r2) {
vmovupd(r1, r2);
}
template <class TType, class R1, class R2,
utils::CheckCmp<TType, float> = true>
template <
class TType, class R1, class R2, utils::CheckCmp<TType, float> = true>
void alias_save(R1 r1, R2 r2) {
alias_load<TType>(r1, r2);
}
template <class TType, class R1, class R2,
utils::CheckCmp<TType, double> = true>
template <
class TType, class R1, class R2, utils::CheckCmp<TType, double> = true>
void alias_save(R1 r1, R2 r2) {
alias_load<TType>(r1, r2);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
template <
class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_ADD(R1 r1, R2 r2, R3 r3) {
vaddps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
template <
class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_ADD(R1 r1, R2 r2, R3 r3) {
vaddpd(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
template <
class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_SUB(R1 r1, R2 r2, R3 r3) {
vsubps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
template <
class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_SUB(R1 r1, R2 r2, R3 r3) {
vsubpd(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
template <
class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_DIV(R1 r1, R2 r2, R3 r3) {
vdivps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
template <
class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_DIV(R1 r1, R2 r2, R3 r3) {
vdivpd(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
template <
class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_MUL(R1 r1, R2 r2, R3 r3) {
vmulps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
template <
class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_MUL(R1 r1, R2 r2, R3 r3) {
vmulpd(r1, r2, r3);
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs, supported_types> =
true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
......@@ -158,9 +168,10 @@ class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
alias_ADD<IType>(zmm2, zmm0, zmm1);
alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs, supported_types> =
true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
......@@ -179,16 +190,20 @@ class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
alias_ADD<T>(zmm2, zmm0, zmm2);
alias_save<T>(ptr[r_out_ + r9 * sizeof(T)], zmm2);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Add,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::Add, supported_types> =
true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
alias_ADD<IType>(zmm2, zmm1, zmm2);
loop_post<IType>();
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Sub,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::Sub, supported_types> =
true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
......@@ -196,8 +211,10 @@ class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
loop_post<IType>();
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Div,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::Div, supported_types> =
true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
......@@ -205,8 +222,10 @@ class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
loop_post<IType>();
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Mul,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::Mul, supported_types> =
true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
......@@ -214,17 +233,19 @@ class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
loop_post<IType>();
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs, supported_types> =
true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(IType)]);
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs, supported_types> =
true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_right + r9 * sizeof(IType)]);
......@@ -236,32 +257,40 @@ class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
alias_load<T>(make_zmm(zmm1) | mask, ptr[r_right + r9 * sizeof(T)]);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Mul,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::Mul, supported_types> =
true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_MUL<IType>(zmm2, zmm2, zmm1);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Add,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::Add, supported_types> =
true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_ADD<DType>(zmm2, zmm2, zmm1);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Div,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::Div, supported_types> =
true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_DIV<DType>(zmm2, zmm2, zmm1);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Sub,
supported_types> = true>
template <
class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::Sub, supported_types> =
true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
......@@ -280,9 +309,10 @@ class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
if (current_cpu.has(Xbyak::util::Cpu::tAVX512F)) {
/* prepare REMAINDER */
mov(r8, r_size_);
and_(r8,
UNIT_PER_REG - 1); // r8_modulo = size/(sizeof(zmm)/sizeof(float))
xor_(r9, r9); // reset r9
and_(
r8,
UNIT_PER_REG - 1); // r8_modulo = size/(sizeof(zmm)/sizeof(float))
xor_(r9, r9); // reset r9
cmp(r_size_, UNIT_PER_REG); // if ( size < 16 ) { }
jl("remainder");
......@@ -306,12 +336,12 @@ class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
sal(rax, cl);
dec(rax); // k1= (1 << r8 )-1
kmovw(k1, eax); // set bitmask
alias_load<DType>(make_zmm(zmm0) | k1,
ptr[r_out_ + r9 * UNIT_SIZE_BYTES]);
alias_load<DType>(
make_zmm(zmm0) | k1, ptr[r_out_ + r9 * UNIT_SIZE_BYTES]);
remainder_operations<Op>(k1);
alias_ADD<DType>(zmm3, zmm2, zmm0);
alias_save<DType>(ptr[r_out_ + r9 * UNIT_SIZE_BYTES],
make_zmm(zmm3) | k1);
alias_save<DType>(
ptr[r_out_ + r9 * UNIT_SIZE_BYTES], make_zmm(zmm3) | k1);
L("done");
applicable_ = true;
log_intel("AVX512F cpu kernel is ready");
......
......@@ -23,8 +23,9 @@ struct has_type<T, std::tuple<U, Ts...>> : has_type<T, std::tuple<Ts...>> {};
template <typename T, typename... Ts>
struct has_type<T, std::tuple<T, Ts...>> : std::true_type {};
template <class OCmp, template <class> class ToP, class Tup,
int ok = std::tuple_size<Tup>::value>
template <
class OCmp, template <class> class ToP, class Tup,
int ok = std::tuple_size<Tup>::value>
struct DeepType;
template <class OCmp, template <class> class ToP, class Tup>
......@@ -38,8 +39,9 @@ struct DeepType<OCmp, ToP, Tup, 2> {
typedef typename std::tuple_element<0, Tup>::type EL1;
typedef typename std::tuple_element<1, Tup>::type EL2;
enum {
value = (std::is_same<OCmp, ToP<EL1>>::value ||
std::is_same<OCmp, ToP<EL2>>::value)
value =
(std::is_same<OCmp, ToP<EL1>>::value ||
std::is_same<OCmp, ToP<EL2>>::value)
};
};
......@@ -49,9 +51,10 @@ struct DeepType<OCmp, ToP, Tup, 3> {
typedef typename std::tuple_element<1, Tup>::type EL2;
typedef typename std::tuple_element<2, Tup>::type EL3;
enum {
value = (std::is_same<OCmp, ToP<EL1>>::value ||
std::is_same<OCmp, ToP<EL2>>::value ||
std::is_same<OCmp, ToP<EL3>>::value)
value =
(std::is_same<OCmp, ToP<EL1>>::value ||
std::is_same<OCmp, ToP<EL2>>::value ||
std::is_same<OCmp, ToP<EL3>>::value)
};
};
......@@ -63,7 +66,7 @@ using CheckCmp = Required<std::is_same<L, R>::value>;
template <class L, class R1, class R2>
using CheckCmp_2 =
Required<std::is_same<L, R1>::value || std::is_same<L, R2>::value>;
Required<std::is_same<L, R1>::value || std::is_same<L, R2>::value>;
template <class OpType, template <class> class TPP, class Tup>
using Verify = Required<utils::DeepType<OpType, TPP, Tup>::value>;
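The Required/CheckCmp/Verify aliases above are what select exactly one of the alias_* or full_chunk_loop_operations overloads in the JIT class earlier in this diff. A reduced, standalone model of that SFINAE dispatch, with no xbyak involved and assuming nothing beyond the standard library:

#include <cstdio>
#include <type_traits>

template <bool B>
using Required = typename std::enable_if<B, bool>::type;

template <class L, class R>
using CheckCmp = Required<std::is_same<L, R>::value>;

// Only the overload whose CheckCmp constraint holds participates in overload
// resolution; the other is removed by SFINAE, mirroring alias_ADD above.
template <class TType, CheckCmp<TType, float> = true>
void alias_add_sketch() { std::puts("would emit vaddps (packed float add)"); }

template <class TType, CheckCmp<TType, double> = true>
void alias_add_sketch() { std::puts("would emit vaddpd (packed double add)"); }

int main() {
  alias_add_sketch<float>();   // selects the float overload
  alias_add_sketch<double>();  // selects the double overload
  return 0;
}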
......
......@@ -3,73 +3,68 @@
* \file api/api_container.cc
* \brief Runtime container APIs. (reference: tvm/src/api/api_lang.cc)
*/
#include <dgl/runtime/ndarray.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/registry.h>
#include <dgl/packed_func_ext.h>
namespace dgl {
namespace runtime {
DGL_REGISTER_GLOBAL("_List")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
auto ret_obj = std::make_shared<runtime::ListObject>();
for (int i = 0; i < args.size(); ++i) {
ret_obj->data.push_back(args[i].obj_sptr());
}
*rv = ret_obj;
});
DGL_REGISTER_GLOBAL("_List").set_body([](DGLArgs args, DGLRetValue* rv) {
auto ret_obj = std::make_shared<runtime::ListObject>();
for (int i = 0; i < args.size(); ++i) {
ret_obj->data.push_back(args[i].obj_sptr());
}
*rv = ret_obj;
});
DGL_REGISTER_GLOBAL("_ListGetItem")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
CHECK(sptr->is_type<ListObject>());
auto* o = static_cast<const ListObject*>(sptr.get());
int64_t i = args[1];
CHECK_LT(i, o->data.size()) << "list out of bound";
*rv = o->data[i];
});
DGL_REGISTER_GLOBAL("_ListGetItem").set_body([](DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
CHECK(sptr->is_type<ListObject>());
auto* o = static_cast<const ListObject*>(sptr.get());
int64_t i = args[1];
CHECK_LT(i, o->data.size()) << "list out of bound";
*rv = o->data[i];
});
DGL_REGISTER_GLOBAL("_ListSize")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
CHECK(sptr->is_type<ListObject>());
auto* o = static_cast<const ListObject*>(sptr.get());
*rv = static_cast<int64_t>(o->data.size());
});
DGL_REGISTER_GLOBAL("_ListSize").set_body([](DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
CHECK(sptr->is_type<ListObject>());
auto* o = static_cast<const ListObject*>(sptr.get());
*rv = static_cast<int64_t>(o->data.size());
});
DGL_REGISTER_GLOBAL("_Map")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
CHECK_EQ(args.size() % 2, 0);
if (args.size() != 0 && args[0].type_code() == kStr) {
// StrMap
StrMapObject::ContainerType data;
for (int i = 0; i < args.size(); i += 2) {
CHECK(args[i].type_code() == kStr)
<< "The key of the map must be string";
CHECK(args[i + 1].type_code() == kObjectHandle)
DGL_REGISTER_GLOBAL("_Map").set_body([](DGLArgs args, DGLRetValue* rv) {
CHECK_EQ(args.size() % 2, 0);
if (args.size() != 0 && args[0].type_code() == kStr) {
// StrMap
StrMapObject::ContainerType data;
for (int i = 0; i < args.size(); i += 2) {
CHECK(args[i].type_code() == kStr) << "The key of the map must be string";
CHECK(args[i + 1].type_code() == kObjectHandle)
<< "The value of the map must be an object type";
data.emplace(std::make_pair(args[i].operator std::string(),
args[i + 1].obj_sptr()));
}
auto obj = std::make_shared<StrMapObject>();
obj->data = std::move(data);
*rv = obj;
} else {
// object container
MapObject::ContainerType data;
for (int i = 0; i < args.size(); i += 2) {
CHECK(args[i].type_code() == kObjectHandle)
data.emplace(std::make_pair(
args[i].operator std::string(), args[i + 1].obj_sptr()));
}
auto obj = std::make_shared<StrMapObject>();
obj->data = std::move(data);
*rv = obj;
} else {
// object container
MapObject::ContainerType data;
for (int i = 0; i < args.size(); i += 2) {
CHECK(args[i].type_code() == kObjectHandle)
<< "The key of the map must be an object type";
CHECK(args[i + 1].type_code() == kObjectHandle)
CHECK(args[i + 1].type_code() == kObjectHandle)
<< "The value of the map must be an object type";
data.emplace(std::make_pair(args[i].obj_sptr(), args[i + 1].obj_sptr()));
}
auto obj = std::make_shared<MapObject>();
obj->data = std::move(data);
*rv = obj;
data.emplace(std::make_pair(args[i].obj_sptr(), args[i + 1].obj_sptr()));
}
});
auto obj = std::make_shared<MapObject>();
obj->data = std::move(data);
*rv = obj;
}
});
DGL_REGISTER_GLOBAL("_EmptyStrMap").set_body([](DGLArgs args, DGLRetValue* rv) {
StrMapObject::ContainerType data;
......@@ -78,84 +73,78 @@ DGL_REGISTER_GLOBAL("_EmptyStrMap").set_body([](DGLArgs args, DGLRetValue* rv) {
*rv = obj;
});
DGL_REGISTER_GLOBAL("_MapSize")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
if (sptr->is_type<MapObject>()) {
auto* o = static_cast<const MapObject*>(sptr.get());
*rv = static_cast<int64_t>(o->data.size());
} else {
CHECK(sptr->is_type<StrMapObject>());
auto* o = static_cast<const StrMapObject*>(sptr.get());
*rv = static_cast<int64_t>(o->data.size());
}
});
DGL_REGISTER_GLOBAL("_MapSize").set_body([](DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
if (sptr->is_type<MapObject>()) {
auto* o = static_cast<const MapObject*>(sptr.get());
*rv = static_cast<int64_t>(o->data.size());
} else {
CHECK(sptr->is_type<StrMapObject>());
auto* o = static_cast<const StrMapObject*>(sptr.get());
*rv = static_cast<int64_t>(o->data.size());
}
});
DGL_REGISTER_GLOBAL("_MapGetItem")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
if (sptr->is_type<MapObject>()) {
auto* o = static_cast<const MapObject*>(sptr.get());
auto it = o->data.find(args[1].obj_sptr());
CHECK(it != o->data.end()) << "cannot find the key in the map";
*rv = (*it).second;
} else {
CHECK(sptr->is_type<StrMapObject>());
auto* o = static_cast<const StrMapObject*>(sptr.get());
auto it = o->data.find(args[1].operator std::string());
CHECK(it != o->data.end()) << "cannot find the key in the map";
*rv = (*it).second;
}
});
DGL_REGISTER_GLOBAL("_MapGetItem").set_body([](DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
if (sptr->is_type<MapObject>()) {
auto* o = static_cast<const MapObject*>(sptr.get());
auto it = o->data.find(args[1].obj_sptr());
CHECK(it != o->data.end()) << "cannot find the key in the map";
*rv = (*it).second;
} else {
CHECK(sptr->is_type<StrMapObject>());
auto* o = static_cast<const StrMapObject*>(sptr.get());
auto it = o->data.find(args[1].operator std::string());
CHECK(it != o->data.end()) << "cannot find the key in the map";
*rv = (*it).second;
}
});
DGL_REGISTER_GLOBAL("_MapItems")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
if (sptr->is_type<MapObject>()) {
auto* o = static_cast<const MapObject*>(sptr.get());
auto rkvs = std::make_shared<ListObject>();
for (const auto& kv : o->data) {
rkvs->data.push_back(kv.first);
rkvs->data.push_back(kv.second);
}
*rv = rkvs;
} else {
CHECK(sptr->is_type<StrMapObject>());
auto* o = static_cast<const StrMapObject*>(sptr.get());
auto rkvs = std::make_shared<ListObject>();
for (const auto& kv : o->data) {
rkvs->data.push_back(MakeValue(kv.first));
rkvs->data.push_back(kv.second);
}
*rv = rkvs;
DGL_REGISTER_GLOBAL("_MapItems").set_body([](DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
if (sptr->is_type<MapObject>()) {
auto* o = static_cast<const MapObject*>(sptr.get());
auto rkvs = std::make_shared<ListObject>();
for (const auto& kv : o->data) {
rkvs->data.push_back(kv.first);
rkvs->data.push_back(kv.second);
}
});
DGL_REGISTER_GLOBAL("_MapCount")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
if (sptr->is_type<MapObject>()) {
auto* o = static_cast<const MapObject*>(sptr.get());
*rv = static_cast<int64_t>(o->data.count(args[1].obj_sptr()));
} else {
CHECK(sptr->is_type<StrMapObject>());
auto* o = static_cast<const StrMapObject*>(sptr.get());
*rv = static_cast<int64_t>(o->data.count(args[1].operator std::string()));
*rv = rkvs;
} else {
CHECK(sptr->is_type<StrMapObject>());
auto* o = static_cast<const StrMapObject*>(sptr.get());
auto rkvs = std::make_shared<ListObject>();
for (const auto& kv : o->data) {
rkvs->data.push_back(MakeValue(kv.first));
rkvs->data.push_back(kv.second);
}
});
*rv = rkvs;
}
});
DGL_REGISTER_GLOBAL("_Value")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
*rv = MakeValue(args[0]);
});
DGL_REGISTER_GLOBAL("_MapCount").set_body([](DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
if (sptr->is_type<MapObject>()) {
auto* o = static_cast<const MapObject*>(sptr.get());
*rv = static_cast<int64_t>(o->data.count(args[1].obj_sptr()));
} else {
CHECK(sptr->is_type<StrMapObject>());
auto* o = static_cast<const StrMapObject*>(sptr.get());
*rv = static_cast<int64_t>(o->data.count(args[1].operator std::string()));
}
});
DGL_REGISTER_GLOBAL("_ValueGet")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
CHECK(sptr->is_type<ValueObject>());
auto* o = static_cast<const ValueObject*>(sptr.get());
*rv = o->data;
});
DGL_REGISTER_GLOBAL("_Value").set_body([](DGLArgs args, DGLRetValue* rv) {
*rv = MakeValue(args[0]);
});
DGL_REGISTER_GLOBAL("_ValueGet").set_body([](DGLArgs args, DGLRetValue* rv) {
auto& sptr = args[0].obj_sptr();
CHECK(sptr->is_type<ValueObject>());
auto* o = static_cast<const ValueObject*>(sptr.get());
*rv = o->data;
});
} // namespace runtime
} // namespace dgl
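Every registration above follows one pattern: DGL_REGISTER_GLOBAL binds a name to a type-erased callable in a process-wide table, and the FFI later looks the callable up by name. A toy standalone model of that pattern (this is not DGL's PackedFunc/Registry machinery):

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

using Args = std::vector<int64_t>;
using Body = std::function<int64_t(const Args&)>;

// Process-wide name -> callable table.
std::map<std::string, Body>& Registry() {
  static std::map<std::string, Body> table;
  return table;
}

struct RegisterGlobal {
  explicit RegisterGlobal(std::string name) : name_(std::move(name)) {}
  RegisterGlobal& set_body(Body body) {
    Registry()[name_] = std::move(body);
    return *this;
  }
  std::string name_;
};

// Registration happens at static-initialization time, as with the macros above.
static RegisterGlobal reg_sum = RegisterGlobal("_Sum").set_body(
    [](const Args& args) {
      int64_t total = 0;
      for (int64_t v : args) total += v;
      return total;
    });

int main() {
  std::cout << Registry()["_Sum"]({1, 2, 3}) << std::endl;  // prints 6
  return 0;
}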
......@@ -3,10 +3,11 @@
* \file api/api_test.cc
* \brief C APIs for testing FFI
*/
#include <dgl/runtime/ndarray.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/registry.h>
#include <dgl/packed_func_ext.h>
#include <thread>
namespace dgl {
......@@ -18,12 +19,12 @@ namespace runtime {
// - The argument to pass to the python callback
// It returns what python callback returns
DGL_REGISTER_GLOBAL("_TestPythonCallback")
.set_body([](DGLArgs args, DGLRetValue* rv) {
LOG(INFO) << "Inside C API";
PackedFunc fn = args[0];
DGLArgs cb_args(args.values + 1, args.type_codes + 1, 1);
fn.CallPacked(cb_args, rv);
});
.set_body([](DGLArgs args, DGLRetValue* rv) {
LOG(INFO) << "Inside C API";
PackedFunc fn = args[0];
DGLArgs cb_args(args.values + 1, args.type_codes + 1, 1);
fn.CallPacked(cb_args, rv);
});
// Register an internal API for testing python callback.
// It receives two arguments:
......@@ -34,17 +35,16 @@ DGL_REGISTER_GLOBAL("_TestPythonCallback")
// The API runs the python callback in a separate thread to test
// python GIL is properly released.
DGL_REGISTER_GLOBAL("_TestPythonCallbackThread")
.set_body([](DGLArgs args, DGLRetValue* rv) {
LOG(INFO) << "Inside C API";
PackedFunc fn = args[0];
auto thr = std::make_shared<std::thread>(
[fn, args, rv]() {
.set_body([](DGLArgs args, DGLRetValue* rv) {
LOG(INFO) << "Inside C API";
PackedFunc fn = args[0];
auto thr = std::make_shared<std::thread>([fn, args, rv]() {
LOG(INFO) << "Callback thread " << std::this_thread::get_id();
DGLArgs cb_args(args.values + 1, args.type_codes + 1, 1);
fn.CallPacked(cb_args, rv);
});
thr->join();
});
thr->join();
});
} // namespace runtime
} // namespace dgl
......@@ -4,11 +4,12 @@
* \brief DGL array arithmetic operations
*/
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/ndarray.h>
#include "../c_api_common.h"
#include "./array_op.h"
#include "./arith.h"
#include "./array_op.h"
using namespace dgl::runtime;
......@@ -16,56 +17,55 @@ namespace dgl {
namespace aten {
// Generate operators with both operands being NDArrays.
#define BINARY_ELEMENT_OP(name, op) \
IdArray name(IdArray lhs, IdArray rhs) { \
IdArray ret; \
CHECK_SAME_DTYPE(lhs, rhs); \
CHECK_SAME_CONTEXT(lhs, rhs); \
ATEN_XPU_SWITCH_CUDA(lhs->ctx.device_type, XPU, #name, { \
ATEN_ID_TYPE_SWITCH(lhs->dtype, IdType, { \
ret = impl::BinaryElewise<XPU, IdType, arith::op>(lhs, rhs); \
}); \
}); \
return ret; \
#define BINARY_ELEMENT_OP(name, op) \
IdArray name(IdArray lhs, IdArray rhs) { \
IdArray ret; \
CHECK_SAME_DTYPE(lhs, rhs); \
CHECK_SAME_CONTEXT(lhs, rhs); \
ATEN_XPU_SWITCH_CUDA(lhs->ctx.device_type, XPU, #name, { \
ATEN_ID_TYPE_SWITCH(lhs->dtype, IdType, { \
ret = impl::BinaryElewise<XPU, IdType, arith::op>(lhs, rhs); \
}); \
}); \
return ret; \
}
// Generate operators with only lhs being NDArray.
#define BINARY_ELEMENT_OP_L(name, op) \
IdArray name(IdArray lhs, int64_t rhs) { \
IdArray ret; \
ATEN_XPU_SWITCH_CUDA(lhs->ctx.device_type, XPU, #name, { \
ATEN_ID_TYPE_SWITCH(lhs->dtype, IdType, { \
ret = impl::BinaryElewise<XPU, IdType, arith::op>(lhs, rhs); \
}); \
}); \
return ret; \
#define BINARY_ELEMENT_OP_L(name, op) \
IdArray name(IdArray lhs, int64_t rhs) { \
IdArray ret; \
ATEN_XPU_SWITCH_CUDA(lhs->ctx.device_type, XPU, #name, { \
ATEN_ID_TYPE_SWITCH(lhs->dtype, IdType, { \
ret = impl::BinaryElewise<XPU, IdType, arith::op>(lhs, rhs); \
}); \
}); \
return ret; \
}
// Generate operators with only rhs being NDArray.
#define BINARY_ELEMENT_OP_R(name, op) \
IdArray name(int64_t lhs, IdArray rhs) { \
IdArray ret; \
ATEN_XPU_SWITCH_CUDA(rhs->ctx.device_type, XPU, #name, { \
ATEN_ID_TYPE_SWITCH(rhs->dtype, IdType, { \
ret = impl::BinaryElewise<XPU, IdType, arith::op>(lhs, rhs); \
}); \
}); \
return ret; \
#define BINARY_ELEMENT_OP_R(name, op) \
IdArray name(int64_t lhs, IdArray rhs) { \
IdArray ret; \
ATEN_XPU_SWITCH_CUDA(rhs->ctx.device_type, XPU, #name, { \
ATEN_ID_TYPE_SWITCH(rhs->dtype, IdType, { \
ret = impl::BinaryElewise<XPU, IdType, arith::op>(lhs, rhs); \
}); \
}); \
return ret; \
}
// Generate unary operators on an NDArray.
#define UNARY_ELEMENT_OP(name, op) \
IdArray name(IdArray lhs) { \
IdArray ret; \
ATEN_XPU_SWITCH_CUDA(lhs->ctx.device_type, XPU, #name, { \
ATEN_ID_TYPE_SWITCH(lhs->dtype, IdType, { \
ret = impl::UnaryElewise<XPU, IdType, arith::op>(lhs); \
}); \
}); \
return ret; \
#define UNARY_ELEMENT_OP(name, op) \
IdArray name(IdArray lhs) { \
IdArray ret; \
ATEN_XPU_SWITCH_CUDA(lhs->ctx.device_type, XPU, #name, { \
ATEN_ID_TYPE_SWITCH(lhs->dtype, IdType, { \
ret = impl::UnaryElewise<XPU, IdType, arith::op>(lhs); \
}); \
}); \
return ret; \
}
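For reference, the first invocation below, BINARY_ELEMENT_OP(Add, Add), expands roughly to the following function; the ATEN_* switch macros dispatch on device type and integer width, and #name stringifies the operator name for error reporting.

// Approximate expansion of BINARY_ELEMENT_OP(Add, Add):
IdArray Add(IdArray lhs, IdArray rhs) {
  IdArray ret;
  CHECK_SAME_DTYPE(lhs, rhs);
  CHECK_SAME_CONTEXT(lhs, rhs);
  ATEN_XPU_SWITCH_CUDA(lhs->ctx.device_type, XPU, "Add", {
    ATEN_ID_TYPE_SWITCH(lhs->dtype, IdType, {
      ret = impl::BinaryElewise<XPU, IdType, arith::Add>(lhs, rhs);
    });
  });
  return ret;
}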
BINARY_ELEMENT_OP(Add, Add)
BINARY_ELEMENT_OP(Sub, Sub)
BINARY_ELEMENT_OP(Mul, Mul)
......@@ -108,106 +108,104 @@ UNARY_ELEMENT_OP(Neg, Neg)
} // namespace dgl
///////////////// Operator overloading for NDArray /////////////////
NDArray operator + (const NDArray& lhs, const NDArray& rhs) {
NDArray operator+(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::Add(lhs, rhs);
}
NDArray operator - (const NDArray& lhs, const NDArray& rhs) {
NDArray operator-(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::Sub(lhs, rhs);
}
NDArray operator * (const NDArray& lhs, const NDArray& rhs) {
NDArray operator*(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::Mul(lhs, rhs);
}
NDArray operator / (const NDArray& lhs, const NDArray& rhs) {
NDArray operator/(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::Div(lhs, rhs);
}
NDArray operator % (const NDArray& lhs, const NDArray& rhs) {
NDArray operator%(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::Mod(lhs, rhs);
}
NDArray operator + (const NDArray& lhs, int64_t rhs) {
NDArray operator+(const NDArray& lhs, int64_t rhs) {
return dgl::aten::Add(lhs, rhs);
}
NDArray operator - (const NDArray& lhs, int64_t rhs) {
NDArray operator-(const NDArray& lhs, int64_t rhs) {
return dgl::aten::Sub(lhs, rhs);
}
NDArray operator * (const NDArray& lhs, int64_t rhs) {
NDArray operator*(const NDArray& lhs, int64_t rhs) {
return dgl::aten::Mul(lhs, rhs);
}
NDArray operator / (const NDArray& lhs, int64_t rhs) {
NDArray operator/(const NDArray& lhs, int64_t rhs) {
return dgl::aten::Div(lhs, rhs);
}
NDArray operator % (const NDArray& lhs, int64_t rhs) {
NDArray operator%(const NDArray& lhs, int64_t rhs) {
return dgl::aten::Mod(lhs, rhs);
}
NDArray operator + (int64_t lhs, const NDArray& rhs) {
NDArray operator+(int64_t lhs, const NDArray& rhs) {
return dgl::aten::Add(lhs, rhs);
}
NDArray operator - (int64_t lhs, const NDArray& rhs) {
NDArray operator-(int64_t lhs, const NDArray& rhs) {
return dgl::aten::Sub(lhs, rhs);
}
NDArray operator * (int64_t lhs, const NDArray& rhs) {
NDArray operator*(int64_t lhs, const NDArray& rhs) {
return dgl::aten::Mul(lhs, rhs);
}
NDArray operator / (int64_t lhs, const NDArray& rhs) {
NDArray operator/(int64_t lhs, const NDArray& rhs) {
return dgl::aten::Div(lhs, rhs);
}
NDArray operator % (int64_t lhs, const NDArray& rhs) {
NDArray operator%(int64_t lhs, const NDArray& rhs) {
return dgl::aten::Mod(lhs, rhs);
}
NDArray operator - (const NDArray& array) {
return dgl::aten::Neg(array);
}
NDArray operator-(const NDArray& array) { return dgl::aten::Neg(array); }
NDArray operator > (const NDArray& lhs, const NDArray& rhs) {
NDArray operator>(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::GT(lhs, rhs);
}
NDArray operator < (const NDArray& lhs, const NDArray& rhs) {
NDArray operator<(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::LT(lhs, rhs);
}
NDArray operator >= (const NDArray& lhs, const NDArray& rhs) {
NDArray operator>=(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::GE(lhs, rhs);
}
NDArray operator <= (const NDArray& lhs, const NDArray& rhs) {
NDArray operator<=(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::LE(lhs, rhs);
}
NDArray operator == (const NDArray& lhs, const NDArray& rhs) {
NDArray operator==(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::EQ(lhs, rhs);
}
NDArray operator != (const NDArray& lhs, const NDArray& rhs) {
NDArray operator!=(const NDArray& lhs, const NDArray& rhs) {
return dgl::aten::NE(lhs, rhs);
}
NDArray operator > (const NDArray& lhs, int64_t rhs) {
NDArray operator>(const NDArray& lhs, int64_t rhs) {
return dgl::aten::GT(lhs, rhs);
}
NDArray operator < (const NDArray& lhs, int64_t rhs) {
NDArray operator<(const NDArray& lhs, int64_t rhs) {
return dgl::aten::LT(lhs, rhs);
}
NDArray operator >= (const NDArray& lhs, int64_t rhs) {
NDArray operator>=(const NDArray& lhs, int64_t rhs) {
return dgl::aten::GE(lhs, rhs);
}
NDArray operator <= (const NDArray& lhs, int64_t rhs) {
NDArray operator<=(const NDArray& lhs, int64_t rhs) {
return dgl::aten::LE(lhs, rhs);
}
NDArray operator == (const NDArray& lhs, int64_t rhs) {
NDArray operator==(const NDArray& lhs, int64_t rhs) {
return dgl::aten::EQ(lhs, rhs);
}
NDArray operator != (const NDArray& lhs, int64_t rhs) {
NDArray operator!=(const NDArray& lhs, int64_t rhs) {
return dgl::aten::NE(lhs, rhs);
}
NDArray operator > (int64_t lhs, const NDArray& rhs) {
NDArray operator>(int64_t lhs, const NDArray& rhs) {
return dgl::aten::GT(lhs, rhs);
}
NDArray operator < (int64_t lhs, const NDArray& rhs) {
NDArray operator<(int64_t lhs, const NDArray& rhs) {
return dgl::aten::LT(lhs, rhs);
}
NDArray operator >= (int64_t lhs, const NDArray& rhs) {
NDArray operator>=(int64_t lhs, const NDArray& rhs) {
return dgl::aten::GE(lhs, rhs);
}
NDArray operator <= (int64_t lhs, const NDArray& rhs) {
NDArray operator<=(int64_t lhs, const NDArray& rhs) {
return dgl::aten::LE(lhs, rhs);
}
NDArray operator == (int64_t lhs, const NDArray& rhs) {
NDArray operator==(int64_t lhs, const NDArray& rhs) {
return dgl::aten::EQ(lhs, rhs);
}
NDArray operator != (int64_t lhs, const NDArray& rhs) {
NDArray operator!=(int64_t lhs, const NDArray& rhs) {
return dgl::aten::NE(lhs, rhs);
}
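A minimal usage sketch of these overloads, assuming `a` and `b` are IdArrays with the same dtype and device context (names are illustrative, not part of the diff):
NDArray sum = a + b;     // forwards to dgl::aten::Add(a, b)
NDArray scaled = a * 2;  // forwards to dgl::aten::Mul(a, 2)
NDArray mask = a >= b;   // forwards to dgl::aten::GE(a, b), element-wise comparison
NDArray neg = -a;        // forwards to dgl::aten::Neg(a)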
......@@ -6,58 +6,52 @@
#ifndef DGL_ARRAY_CHECK_H_
#define DGL_ARRAY_CHECK_H_
#include <dgl/runtime/ndarray.h>
#include <dgl/array.h>
#include <vector>
#include <dgl/runtime/ndarray.h>
#include <string>
#include <vector>
namespace dgl {
namespace aten {
// Check whether the given arguments have the same context.
inline void CheckCtx(
const DGLContext& ctx,
const std::vector<NDArray>& arrays,
const DGLContext& ctx, const std::vector<NDArray>& arrays,
const std::vector<std::string>& names) {
for (size_t i = 0; i < arrays.size(); ++i) {
if (IsNullArray(arrays[i]))
continue;
if (IsNullArray(arrays[i])) continue;
CHECK_EQ(ctx, arrays[i]->ctx)
<< "Expected device context " << ctx << ". But got "
<< arrays[i]->ctx << " for " << names[i] << ".";
<< "Expected device context " << ctx << ". But got " << arrays[i]->ctx
<< " for " << names[i] << ".";
}
}
// Check whether input tensors are contiguous.
inline void CheckContiguous(
const std::vector<NDArray>& arrays,
const std::vector<std::string>& names) {
const std::vector<NDArray>& arrays, const std::vector<std::string>& names) {
for (size_t i = 0; i < arrays.size(); ++i) {
if (IsNullArray(arrays[i]))
continue;
if (IsNullArray(arrays[i])) continue;
CHECK(arrays[i].IsContiguous())
<< "Expect " << names[i] << " to be a contiguous tensor";
<< "Expect " << names[i] << " to be a contiguous tensor";
}
}
// Check whether input tensors have valid shape.
inline void CheckShape(
const std::vector<uint64_t>& gdim,
const std::vector<int>& uev_idx,
const std::vector<NDArray>& arrays,
const std::vector<std::string>& names) {
const std::vector<uint64_t>& gdim, const std::vector<int>& uev_idx,
const std::vector<NDArray>& arrays, const std::vector<std::string>& names) {
for (size_t i = 0; i < arrays.size(); ++i) {
if (IsNullArray(arrays[i]))
continue;
if (IsNullArray(arrays[i])) continue;
CHECK_GE(arrays[i]->ndim, 2)
<< "Expect " << names[i] << " to have ndim >= 2, "
<< "Note that for scalar feature we expand its "
<< "dimension with an additional dimension of "
<< "length one.";
<< "Expect " << names[i] << " to have ndim >= 2, "
<< "Note that for scalar feature we expand its "
<< "dimension with an additional dimension of "
<< "length one.";
CHECK_EQ(gdim[uev_idx[i]], arrays[i]->shape[0])
<< "Expect " << names[i] << " to have size "
<< gdim[uev_idx[i]] << " on the first dimension, "
<< "but got " << arrays[i]->shape[0];
<< "Expect " << names[i] << " to have size " << gdim[uev_idx[i]]
<< " on the first dimension, "
<< "but got " << arrays[i]->shape[0];
}
}
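A minimal sketch of how these helpers are typically combined before a kernel call; the feature arrays and dimension bookkeeping below are hypothetical, not part of this change:
std::vector<NDArray> arrays = {ufeat, efeat};         // hypothetical inputs
std::vector<std::string> names = {"ufeat", "efeat"};
CheckCtx(ufeat->ctx, arrays, names);   // all arrays on the same device
CheckContiguous(arrays, names);        // no strided tensors
// gdim holds the graph dimensions (e.g. #src nodes, #edges); uev_idx maps
// each array to the dimension its first axis must match.
CheckShape({num_src_nodes, num_edges}, {0, 1}, arrays, names);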
......
......@@ -14,22 +14,21 @@ template <DGLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero) {
const int64_t len = array.NumElements();
if (len == 0)
return !prepend_zero ? array : aten::Full(0, 1, array->dtype.bits, array->ctx);
return !prepend_zero ? array
: aten::Full(0, 1, array->dtype.bits, array->ctx);
if (prepend_zero) {
IdArray ret = aten::NewIdArray(len + 1, array->ctx, array->dtype.bits);
const IdType* in_d = array.Ptr<IdType>();
IdType* out_d = ret.Ptr<IdType>();
out_d[0] = 0;
for (int64_t i = 0; i < len; ++i)
out_d[i + 1] = out_d[i] + in_d[i];
for (int64_t i = 0; i < len; ++i) out_d[i + 1] = out_d[i] + in_d[i];
return ret;
} else {
IdArray ret = aten::NewIdArray(len, array->ctx, array->dtype.bits);
const IdType* in_d = array.Ptr<IdType>();
IdType* out_d = ret.Ptr<IdType>();
out_d[0] = in_d[0];
for (int64_t i = 1; i < len; ++i)
out_d[i] = out_d[i - 1] + in_d[i];
for (int64_t i = 1; i < len; ++i) out_d[i] = out_d[i - 1] + in_d[i];
return ret;
}
}
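A worked example of the two branches (values are illustrative):
// For input [1, 2, 3]:
//   CumSum(array, /*prepend_zero=*/false) -> [1, 3, 6]
//   CumSum(array, /*prepend_zero=*/true)  -> [0, 1, 3, 6]
// An empty input is returned unchanged, or replaced by a one-element zero
// array when prepend_zero is true.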
......
......@@ -10,10 +10,11 @@ using runtime::NDArray;
namespace aten {
namespace impl {
template<DGLDeviceType XPU, typename DType, typename IdType>
template <DGLDeviceType XPU, typename DType, typename IdType>
NDArray IndexSelect(NDArray array, IdArray index) {
CHECK_EQ(array->shape[0], array.NumElements()) << "Only support tensor"
<< " whose first dimension equals number of elements, e.g. (5,), (5, 1)";
CHECK_EQ(array->shape[0], array.NumElements())
<< "Only support tensor"
<< " whose first dimension equals number of elements, e.g. (5,), (5, 1)";
const DType* array_data = static_cast<DType*>(array->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
......
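A worked example of the supported case in IndexSelect (values are illustrative):
// With array = [10, 20, 30, 40, 50] (shape (5,)) and index = [4, 0, 2],
// IndexSelect(array, index) returns [50, 10, 30]. A tensor of shape (5, 2)
// fails the CHECK_EQ above because its first dimension does not equal its
// number of elements.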
......@@ -15,8 +15,7 @@ IdArray NonZero(IdArray array) {
std::vector<int64_t> ret;
const IdType* data = array.Ptr<IdType>();
for (int64_t i = 0; i < array->shape[0]; ++i)
if (data[i] != 0)
ret.push_back(i);
if (data[i] != 0) ret.push_back(i);
return NDArray::FromVector(ret, array->ctx);
}
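A worked example (values are illustrative):
// NonZero([0, 3, 0, 7]) returns [1, 3]: the indices of the nonzero entries,
// placed on the same device as the input array.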
......
......@@ -6,7 +6,9 @@
#include <dgl/array.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/parallel_for.h>
#include <numeric>
#include "../arith.h"
namespace dgl {
......@@ -51,116 +53,186 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
const IdType* rhs_data = static_cast<IdType*>(rhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning,
// scheduling, etc., especially since the workload is very light. Need to
// replace with parallel_for.
for (int64_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i], rhs_data[i]);
}
return ret;
}
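A hedged sketch of the parallel_for replacement the TODO refers to; the chunked (begin, end) callback signature of runtime::parallel_for is assumed here and should be checked against the header before use:
// Sketch only, not part of this diff.
runtime::parallel_for(0, lhs->shape[0], [&](size_t b, size_t e) {
  for (size_t i = b; i < e; ++i) {
    ret_data[i] = Op::Call(lhs_data[i], rhs_data[i]);
  }
});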
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Add>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Sub>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mul>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Div>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mod>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::EQ>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::NE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Add>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Sub>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mul>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Div>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mod>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::EQ>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::NE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Add>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Sub>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mul>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Div>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mod>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GT>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LT>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GE>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LE>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::EQ>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::NE>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Add>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Sub>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mul>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Div>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mod>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GT>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LT>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GE>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LE>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::EQ>(
IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::NE>(
IdArray lhs, IdArray rhs);
template <DGLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdArray lhs, IdType rhs) {
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning,
// scheduling, etc., especially since the workload is very light. Need to
// replace with parallel_for.
for (int64_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i], rhs);
}
return ret;
}
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Add>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Sub>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mul>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Div>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mod>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::EQ>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::NE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Add>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Sub>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mul>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Div>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mod>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::EQ>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::NE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Add>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Sub>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mul>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Div>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mod>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GT>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LT>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GE>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LE>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::EQ>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::NE>(
IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Add>(
IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Sub>(
IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mul>(
IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Div>(
IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mod>(
IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GT>(
IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LT>(
IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GE>(
IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LE>(
IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::EQ>(
IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::NE>(
IdArray lhs, int64_t rhs);
template <DGLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdType lhs, IdArray rhs) {
IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits);
const IdType* rhs_data = static_cast<IdType*>(rhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning,
// scheduling, etc., especially since the workload is very light. Need to
// replace with parallel_for.
for (int64_t i = 0; i < rhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs, rhs_data[i]);
}
return ret;
}
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Add>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Sub>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mul>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Div>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mod>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::EQ>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::NE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Add>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Sub>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mul>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Div>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mod>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::EQ>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::NE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Add>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Sub>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mul>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Div>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::Mod>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GT>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LT>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::GE>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::LE>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::EQ>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int32_t, arith::NE>(
int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Add>(
int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Sub>(
int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mul>(
int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Div>(
int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::Mod>(
int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GT>(
int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LT>(
int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::GE>(
int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::LE>(
int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::EQ>(
int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDGLCPU, int64_t, arith::NE>(
int64_t lhs, IdArray rhs);
template <DGLDeviceType XPU, typename IdType, typename Op>
IdArray UnaryElewise(IdArray lhs) {
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning, scheduling,
// etc., especially since the workload is very light. Need to replace with parallel_for.
// TODO(BarclayII): this usually incurs lots of overhead in thread spawning,
// scheduling, etc., especially since the workload is very light. Need to
// replace with parallel_for.
for (int64_t i = 0; i < lhs->shape[0]; i++) {
ret_data[i] = Op::Call(lhs_data[i]);
}
......@@ -180,10 +252,14 @@ NDArray Full(DType val, int64_t length, DGLContext ctx) {
return ret;
}
template NDArray Full<kDGLCPU, int32_t>(int32_t val, int64_t length, DGLContext ctx);
template NDArray Full<kDGLCPU, int64_t>(int64_t val, int64_t length, DGLContext ctx);
template NDArray Full<kDGLCPU, float>(float val, int64_t length, DGLContext ctx);
template NDArray Full<kDGLCPU, double>(double val, int64_t length, DGLContext ctx);
template NDArray Full<kDGLCPU, int32_t>(
int32_t val, int64_t length, DGLContext ctx);
template NDArray Full<kDGLCPU, int64_t>(
int64_t val, int64_t length, DGLContext ctx);
template NDArray Full<kDGLCPU, float>(
float val, int64_t length, DGLContext ctx);
template NDArray Full<kDGLCPU, double>(
double val, int64_t length, DGLContext ctx);
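A minimal usage sketch of the templated Full; the call site and values are illustrative, not part of this change:
// Creates a length-16 float array filled with 1.0 on CPU. The
// dtype-dispatching aten::Full(val, length, nbits, ctx) wrapper used in
// CumSum above forwards to instantiations like these.
NDArray ones = Full<kDGLCPU, float>(1.0f, 16, DGLContext{kDGLCPU, 0});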
///////////////////////////// Range /////////////////////////////
......@@ -216,7 +292,8 @@ IdArray Relabel_(const std::vector<IdArray>& arrays) {
}
}
// map array
IdArray maparr = NewIdArray(newid, DGLContext{kDGLCPU, 0}, sizeof(IdType) * 8);
IdArray maparr =
NewIdArray(newid, DGLContext{kDGLCPU, 0}, sizeof(IdType) * 8);
IdType* maparr_data = static_cast<IdType*>(maparr->data);
for (const auto& kv : oldv2newv) {
maparr_data[kv.second] = kv.first;
......
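A worked example of the relabeling semantics, assuming new IDs are assigned in order of first appearance (values are illustrative):
// Relabel_({[10, 20, 10], [20, 30]}) rewrites the arrays in place to
// [0, 1, 0] and [1, 2], and returns the mapping array [10, 20, 30], where
// position i holds the old ID that was assigned new ID i.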