"src/diffusers/commands/diffusers_cli.py" did not exist on "27266abc9ff8cda37a892e6ddfdd6a5caab94e66"
Unverified Commit c88fca50 authored by Tianqi Zhang (张天启), committed by GitHub

[Feature] Add edge coarsening for homogeneous undirected graphs (#2691)



* finish graph matching gpu version

* use C++ shuffle

* finish graph matching

* fix bug

* fix bug

* change name and use swap

* upt

* fix format problem

* fix format problem

* stronger test

* upt

* upt

* change python api

* upt

* upt

* format check

* upt

* upt

* fix bug
Co-authored-by: Tong He <hetong007@gmail.com>
parent 91cb3477
@@ -272,7 +272,8 @@ macro(dgl_config_cuda out_variable)
  list(APPEND DGL_LINKER_LIBS
    ${CUDA_CUDART_LIBRARY}
    ${CUDA_CUBLAS_LIBRARIES}
    ${CUDA_cusparse_LIBRARY}
    ${CUDA_CURAND_LIBRARY})
  set(${out_variable} ${DGL_CUDA_SRC})
endmacro()
@@ -40,6 +40,9 @@ macro(find_cuda use_cuda)
    find_library(CUDA_CUBLAS_LIBRARY cublas
      ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
      ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
    find_library(CUDA_CURAND_LIBRARY curand
      ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
      ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
  else(MSVC)
    #find_library(CUDA_CUDA_LIBRARY cuda
    #  PATHS ${CUDA_TOOLKIT_ROOT_DIR}
@@ -48,6 +51,9 @@ macro(find_cuda use_cuda)
    find_library(CUDA_CUBLAS_LIBRARY cublas
      ${CUDA_TOOLKIT_ROOT_DIR}/lib64
      ${CUDA_TOOLKIT_ROOT_DIR}/lib)
    find_library(CUDA_CURAND_LIBRARY curand
      ${CUDA_TOOLKIT_ROOT_DIR}/lib64
      ${CUDA_TOOLKIT_ROOT_DIR}/lib)
  endif(MSVC)
  message(STATUS "Found CUDA_TOOLKIT_ROOT_DIR=" ${CUDA_TOOLKIT_ROOT_DIR})
  #message(STATUS "Found CUDA_CUDA_LIBRARY=" ${CUDA_CUDA_LIBRARY})
@@ -55,5 +61,6 @@ macro(find_cuda use_cuda)
    #message(STATUS "Found CUDA_NVRTC_LIBRARY=" ${CUDA_NVRTC_LIBRARY})
    #message(STATUS "Found CUDA_CUDNN_LIBRARY=" ${CUDA_CUDNN_LIBRARY})
    message(STATUS "Found CUDA_CUBLAS_LIBRARY=" ${CUDA_CUBLAS_LIBRARY})
    message(STATUS "Found CUDA_CURAND_LIBRARY=" ${CUDA_CURAND_LIBRARY})
  endif(CUDA_FOUND)
endmacro(find_cuda)
@@ -3,6 +3,7 @@
import importlib
import sys

from ..backend import backend_name

def _load_backend(mod_name):
    mod = importlib.import_module('.%s' % mod_name, __name__)
    thismod = sys.modules[__name__]
...
"""Python interfaces to DGL farthest point sampler.""" """Python interfaces to DGL farthest point sampler."""
from dgl._ffi.base import DGLError
import numpy as np
from .._ffi.function import _init_api from .._ffi.function import _init_api
from .. import backend as F from .. import backend as F
from .. import ndarray as nd
def farthest_point_sampler(data, batch_size, sample_points, dist, start_idx, result): def farthest_point_sampler(data, batch_size, sample_points, dist, start_idx, result):
"""Farthest Point Sampler r"""Farthest Point Sampler
Parameters Parameters
---------- ----------
...@@ -34,4 +38,65 @@ def farthest_point_sampler(data, batch_size, sample_points, dist, start_idx, res ...@@ -34,4 +38,65 @@ def farthest_point_sampler(data, batch_size, sample_points, dist, start_idx, res
F.zerocopy_to_dgl_ndarray(start_idx), F.zerocopy_to_dgl_ndarray(start_idx),
F.zerocopy_to_dgl_ndarray(result)) F.zerocopy_to_dgl_ndarray(result))

def _neighbor_matching(graph_idx, num_nodes, edge_weights=None, relabel_idx=True):
    """
    Description
    -----------
    The neighbor matching procedure of edge coarsening used in
    `Metis <http://cacs.usc.edu/education/cs653/Karypis-METIS-SIAMJSC98.pdf>`__
    and
    `Graclus <https://www.cs.utexas.edu/users/inderjit/public_papers/multilevel_pami.pdf>`__
    for homogeneous graph coarsening. This procedure keeps picking an unmarked
    vertex and matching it with one of its unmarked neighbors (the one that
    maximizes the edge weight) until no more matches can be made.

    If no edge weight is given, this procedure randomly picks a neighbor for each
    vertex.

    The GPU implementation is based on `A GPU Algorithm for Greedy Graph Matching
    <http://www.staff.science.uu.nl/~bisse101/Articles/match12.pdf>`__.

    NOTE: The input graph must be a bi-directed (undirected) graph. Call
    :obj:`dgl.to_bidirected` if you are not sure whether your graph is bi-directed.

    Parameters
    ----------
    graph_idx : HeteroGraphIndex
        The input homogeneous graph.
    num_nodes : int
        The number of nodes in this homogeneous graph.
    edge_weights : tensor, optional
        The edge weight tensor holding a non-negative scalar weight for each edge.
        default: :obj:`None`
    relabel_idx : bool, optional
        If true, relabel the resulting node labels to have consecutive node ids.
        default: :obj:`True`

    Returns
    -------
    tensor
        A 1-D tensor in which each element indicates the cluster ID of a vertex.
    """
    edge_weight_capi = nd.NULL["int64"]
    if edge_weights is not None:
        edge_weight_capi = F.zerocopy_to_dgl_ndarray(edge_weights)
    node_label = F.full_1d(
        num_nodes, -1, getattr(F, graph_idx.dtype), F.to_backend_ctx(graph_idx.ctx))
    node_label_capi = F.zerocopy_to_dgl_ndarray_for_write(node_label)
    _CAPI_NeighborMatching(graph_idx, edge_weight_capi, node_label_capi)
    if F.reduce_sum(node_label < 0).item() != 0:
        raise DGLError("Found an unmatched node")

    # reorder node id
    # TODO: actually we can add a `return_inverse` option for the `unique`
    # function in the backend for efficiency.
    if relabel_idx:
        node_label_np = F.zerocopy_to_numpy(node_label)
        _, node_label_np = np.unique(node_label_np, return_inverse=True)
        return F.tensor(node_label_np)
    else:
        return node_label
_init_api('dgl.geometry', __name__)
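
To make the matching semantics concrete, here is a minimal usage sketch against the public wrapper built on this C API (the re-export `from dgl.geometry import neighbor_matching` is the one the tests below use; the exact pairing can vary because unweighted matching is randomized):

import dgl
import torch as th
from dgl.geometry import neighbor_matching

# a bi-directed 4-node path graph: 0 - 1 - 2 - 3
g = dgl.graph((th.tensor([0, 1, 1, 2, 2, 3]), th.tensor([1, 0, 2, 1, 3, 2])))
labels = neighbor_matching(g)  # e.g. tensor([0, 0, 1, 1]) after relabeling
# every cluster ID appears at most twice, and paired nodes are always neighbors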
"""Package for mxnet-specific Geometry modules.""" """Package for mxnet-specific Geometry modules."""
from .fps import * from .fps import *
from .edge_coarsening import *
"""Edge coarsening procedure used in Metis and Graclus, for mxnet"""
# pylint: disable=no-member, invalid-name, W0235
import dgl
import mxnet as mx
from ..capi import _neighbor_matching
__all__ = ['neighbor_matching']
class NeighborMatchingFn(mx.autograd.Function):
    r"""
    Description
    -----------
    AutoGrad function for neighbor matching
    """
    def __init__(self, gidx, num_nodes, e_weights, relabel_idx):
        super(NeighborMatchingFn, self).__init__()
        self.gidx = gidx
        self.num_nodes = num_nodes
        self.e_weights = e_weights
        self.relabel_idx = relabel_idx

    def forward(self):
        r"""
        Description
        -----------
        Perform forward computation
        """
        return _neighbor_matching(
            self.gidx, self.num_nodes, self.e_weights, self.relabel_idx)

    def backward(self):
        r"""
        Description
        -----------
        Perform backward computation
        """
        pass  # pylint: disable=unnecessary-pass
def neighbor_matching(graph, e_weights=None, relabel_idx=True):
    r"""
    Description
    -----------
    The neighbor matching procedure of edge coarsening in
    `Metis <http://cacs.usc.edu/education/cs653/Karypis-METIS-SIAMJSC98.pdf>`__
    and
    `Graclus <https://www.cs.utexas.edu/users/inderjit/public_papers/multilevel_pami.pdf>`__
    for homogeneous graph coarsening. This procedure keeps picking an unmarked
    vertex and matching it with one of its unmarked neighbors (the one that
    maximizes the edge weight) until no more matches can be made.

    If no edge weight is given, this procedure randomly picks a neighbor for each
    vertex.

    The GPU implementation is based on `A GPU Algorithm for Greedy Graph Matching
    <http://www.staff.science.uu.nl/~bisse101/Articles/match12.pdf>`__.

    NOTE: The input graph must be a bi-directed (undirected) graph. Call
    :obj:`dgl.to_bidirected` if you are not sure whether your graph is bi-directed.

    Parameters
    ----------
    graph : DGLGraph
        The input homogeneous graph.
    e_weights : mxnet.NDArray, optional
        The edge weight tensor holding a non-negative scalar weight for each edge.
        default: :obj:`None`
    relabel_idx : bool, optional
        If true, relabel the resulting node labels to have consecutive node ids.
        default: :obj:`True`
    """
    assert graph.is_homogeneous, \
        "The graph used in neighbor matching must be homogeneous"
    if e_weights is not None:
        graph.edata['e_weights'] = e_weights
        graph = dgl.remove_self_loop(graph)
        e_weights = graph.edata['e_weights']
        graph.edata.pop('e_weights')
    else:
        graph = dgl.remove_self_loop(graph)
    func = NeighborMatchingFn(graph._graph, graph.num_nodes(), e_weights, relabel_idx)
    return func()
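
A hedged usage sketch for the MXNet wrapper above (the import path follows the package layout added in this commit and assumes an MXNet-backend build of DGL):

import dgl
import mxnet as mx
from dgl.geometry.mxnet import neighbor_matching  # path assumed from this commit

g = dgl.graph(([0, 1, 1, 2], [1, 0, 2, 1]))  # already bi-directed
w = mx.nd.array([0.5, 0.5, 1.0, 1.0])        # one non-negative weight per edge
labels = neighbor_matching(g, w, relabel_idx=True)  # cluster ID per node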
"""Package for mxnet-specific Geometry modules.""" """Package for pytorch-specific Geometry modules."""
from .fps import * from .fps import *
from .edge_coarsening import *
"""Edge coarsening procedure used in Metis and Graclus, for pytorch"""
# pylint: disable=no-member, invalid-name, W0613
import dgl
import torch as th
from ..capi import _neighbor_matching
__all__ = ['neighbor_matching']
class NeighborMatchingFn(th.autograd.Function):
    r"""
    Description
    -----------
    AutoGrad function for neighbor matching
    """
    @staticmethod
    def forward(ctx, gidx, num_nodes, e_weights, relabel_idx):
        r"""
        Description
        -----------
        Perform forward computation
        """
        return _neighbor_matching(gidx, num_nodes, e_weights, relabel_idx)

    @staticmethod
    def backward(ctx):
        r"""
        Description
        -----------
        Perform backward computation
        """
        pass  # pylint: disable=unnecessary-pass
def neighbor_matching(graph, e_weights=None, relabel_idx=True):
    r"""
    Description
    -----------
    The neighbor matching procedure of edge coarsening in
    `Metis <http://cacs.usc.edu/education/cs653/Karypis-METIS-SIAMJSC98.pdf>`__
    and
    `Graclus <https://www.cs.utexas.edu/users/inderjit/public_papers/multilevel_pami.pdf>`__
    for homogeneous graph coarsening. This procedure keeps picking an unmarked
    vertex and matching it with one of its unmarked neighbors (the one that
    maximizes the edge weight) until no more matches can be made.

    If no edge weight is given, this procedure randomly picks a neighbor for each
    vertex.

    The GPU implementation is based on `A GPU Algorithm for Greedy Graph Matching
    <http://www.staff.science.uu.nl/~bisse101/Articles/match12.pdf>`__.

    NOTE: The input graph must be a bi-directed (undirected) graph. Call
    :obj:`dgl.to_bidirected` if you are not sure whether your graph is bi-directed.

    Parameters
    ----------
    graph : DGLGraph
        The input homogeneous graph.
    e_weights : torch.Tensor, optional
        The edge weight tensor holding a non-negative scalar weight for each edge.
        default: :obj:`None`
    relabel_idx : bool, optional
        If true, relabel the resulting node labels to have consecutive node ids.
        default: :obj:`True`
    """
    assert graph.is_homogeneous, \
        "The graph used in neighbor matching must be homogeneous"
    if e_weights is not None:
        graph.edata['e_weights'] = e_weights
        graph = dgl.remove_self_loop(graph)
        e_weights = graph.edata['e_weights']
        graph.edata.pop('e_weights')
    else:
        graph = dgl.remove_self_loop(graph)
    return NeighborMatchingFn.apply(graph._graph, graph.num_nodes(), e_weights, relabel_idx)
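
The equivalent hedged sketch for the PyTorch wrapper (again, the import path is assumed from the package layout added in this commit):

import dgl
import torch as th
from dgl.geometry.pytorch import neighbor_matching  # path assumed from this commit

g = dgl.graph((th.tensor([0, 1, 1, 2]), th.tensor([1, 0, 2, 1])))  # bi-directed
w = th.rand(g.num_edges())        # non-negative scalar weight per edge
labels = neighbor_matching(g, w)  # relabel_idx defaults to True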
/*!
* Copyright (c) 2019 by Contributors
* \file array/check.h
* \brief DGL check utilities
*/
#ifndef DGL_ARRAY_CHECK_H_
#define DGL_ARRAY_CHECK_H_
#include <dgl/runtime/ndarray.h>
#include <dgl/array.h>
#include <vector>
#include <string>
namespace dgl {
namespace aten {
// Check whether the given arguments have the same context.
inline void CheckCtx(
const DLContext& ctx,
const std::vector<NDArray>& arrays,
const std::vector<std::string>& names) {
for (size_t i = 0; i < arrays.size(); ++i) {
if (IsNullArray(arrays[i]))
continue;
CHECK_EQ(ctx, arrays[i]->ctx)
<< "Expected device context " << ctx << ". But got "
<< arrays[i]->ctx << " for " << names[i] << ".";
}
}
// Check whether input tensors are contiguous.
inline void CheckContiguous(
const std::vector<NDArray>& arrays,
const std::vector<std::string>& names) {
for (size_t i = 0; i < arrays.size(); ++i) {
if (IsNullArray(arrays[i]))
continue;
CHECK(arrays[i].IsContiguous())
<< "Expect " << names[i] << " to be a contiguous tensor";
}
}
// Check whether input tensors have valid shape.
inline void CheckShape(
const std::vector<uint64_t>& gdim,
const std::vector<int>& uev_idx,
const std::vector<NDArray>& arrays,
const std::vector<std::string>& names) {
for (size_t i = 0; i < arrays.size(); ++i) {
if (IsNullArray(arrays[i]))
continue;
CHECK_GE(arrays[i]->ndim, 2)
<< "Expect " << names[i] << " to have ndim >= 2, "
<< "Note that for scalar feature we expand its "
<< "dimension with an additional dimension of "
<< "length one.";
CHECK_EQ(gdim[uev_idx[i]], arrays[i]->shape[0])
<< "Expect " << names[i] << " to have size "
<< gdim[uev_idx[i]] << " on the first dimension, "
<< "but got " << arrays[i]->shape[0];
}
}
} // namespace aten
} // namespace dgl
#endif // DGL_ARRAY_CHECK_H_
@@ -12,6 +12,7 @@
#include "kernel_decl.h"
#include "../c_api_common.h"
#include "./check.h"

using namespace dgl::runtime;

@@ -19,53 +20,6 @@ namespace dgl {
namespace aten {
namespace {
// Check whether the given arguments have the same context.
inline void CheckCtx(
const DLContext& ctx,
const std::vector<NDArray>& arrays,
const std::vector<std::string>& names) {
for (size_t i = 0; i < arrays.size(); ++i) {
if (IsNullArray(arrays[i]))
continue;
CHECK_EQ(ctx, arrays[i]->ctx)
<< "Expected device context " << ctx << ". But got "
<< arrays[i]->ctx << " for " << names[i] << ".";
}
}
// Check whether input tensors are contiguous.
inline void CheckContiguous(
const std::vector<NDArray>& arrays,
const std::vector<std::string>& names) {
for (size_t i = 0; i < arrays.size(); ++i) {
if (IsNullArray(arrays[i]))
continue;
CHECK(arrays[i].IsContiguous())
<< "Expect " << names[i] << " to be a contiguous tensor";
}
}
// Check whether input tensors have valid shape.
inline void CheckShape(
const std::vector<uint64_t>& gdim,
const std::vector<int>& uev_idx,
const std::vector<NDArray>& arrays,
const std::vector<std::string>& names) {
for (size_t i = 0; i < arrays.size(); ++i) {
if (IsNullArray(arrays[i]))
continue;
CHECK_GE(arrays[i]->ndim, 2)
<< "Expect " << names[i] << " to have ndim >= 2, "
<< "Note that for scalar feature we expand its "
<< "dimension with an additional dimension of "
<< "length one.";
CHECK_EQ(gdim[uev_idx[i]], arrays[i]->shape[0])
<< "Expect " << names[i] << " to have size "
<< gdim[uev_idx[i]] << " on the first dimension, "
<< "but got " << arrays[i]->shape[0];
}
}
} // namespace

/*! \brief Generalized Sparse Matrix-Matrix Multiplication. */
...
@@ -3,15 +3,69 @@
 * \file array/cpu/geometry_op_impl.cc
 * \brief Geometry operator CPU implementation
 */
#include <dgl/array.h>
#include <dgl/random.h>
#include <numeric>
#include <vector>
#include <utility>
#include "../geometry_op.h"

namespace dgl {
using runtime::NDArray;

namespace geometry {
namespace impl {
/*! \brief Knuth shuffle algorithm */
template <typename IdType>
void IndexShuffle(IdType *idxs, int64_t num_elems) {
for (int64_t i = num_elems - 1; i > 0; --i) {
int64_t j = dgl::RandomEngine::ThreadLocal()->RandInt(i);
std::swap(idxs[i], idxs[j]);
}
}
template void IndexShuffle<int32_t>(int32_t *idxs, int64_t num_elems);
template void IndexShuffle<int64_t>(int64_t *idxs, int64_t num_elems);
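
For readers less familiar with it, the loop above is the classic Fisher-Yates (Knuth) shuffle; a standalone Python sketch of the same idea (illustrative only, not part of this commit):

import random

def index_shuffle(idxs):
    # walk from the back; swap each slot with a uniformly chosen
    # slot at or before it -- O(n) time, O(1) extra space
    for i in range(len(idxs) - 1, 0, -1):
        j = random.randint(0, i)
        idxs[i], idxs[j] = idxs[j], idxs[i]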
/*! \brief Groupwise index shuffle algorithm. This function shuffles each subarray
 * delimited by the group index. The group index plays the same role as indptr in CSRMatrix.
 *
 * \param group_idxs group index array.
 * \param idxs index array to shuffle.
 * \param num_groups_idxs length of group_idxs
 * \param num_elems length of idxs
 */
template <typename IdType>
void GroupIndexShuffle(const IdType *group_idxs, IdType *idxs,
int64_t num_groups_idxs, int64_t num_elems) {
if (num_groups_idxs < 2) return; // no complete group to shuffle
CHECK_LE(group_idxs[num_groups_idxs - 1], num_elems) << "group_idxs out of range";
for (int64_t i = 0; i < num_groups_idxs - 1; ++i) {
auto subarray_len = group_idxs[i + 1] - group_idxs[i];
IndexShuffle(idxs + group_idxs[i], subarray_len);
}
}
template void GroupIndexShuffle<int32_t>(
const int32_t *group_idxs, int32_t *idxs, int64_t num_groups_idxs, int64_t num_elems);
template void GroupIndexShuffle<int64_t>(
const int64_t *group_idxs, int64_t *idxs, int64_t num_groups_idxs, int64_t num_elems);
template <typename IdType>
IdArray RandomPerm(int64_t num_nodes) {
IdArray perm = aten::NewIdArray(num_nodes, DLContext{kDLCPU, 0}, sizeof(IdType) * 8);
IdType* perm_data = static_cast<IdType*>(perm->data);
std::iota(perm_data, perm_data + num_nodes, 0);
IndexShuffle(perm_data, num_nodes);
return perm;
}
template <typename IdType>
IdArray GroupRandomPerm(const IdType *group_idxs, int64_t num_group_idxs, int64_t num_nodes) {
IdArray perm = aten::NewIdArray(num_nodes, DLContext{kDLCPU, 0}, sizeof(IdType) * 8);
IdType* perm_data = static_cast<IdType*>(perm->data);
std::iota(perm_data, perm_data + num_nodes, 0);
GroupIndexShuffle(group_idxs, perm_data, num_group_idxs, num_nodes);
return perm;
}
/*!
 * \brief Farthest Point Sampler without the need to compute all pairs of distance.
 *
@@ -81,7 +135,6 @@ void FarthestPointSampler(NDArray array, int64_t batch_size, int64_t sample_points,
ret_start += sample_points;
}
}

template void FarthestPointSampler<kDLCPU, float, int32_t>(
NDArray array, int64_t batch_size, int64_t sample_points,
NDArray dist, IdArray start_idx, IdArray result);
@@ -95,6 +148,82 @@ template void FarthestPointSampler<kDLCPU, double, int64_t>(
NDArray array, int64_t batch_size, int64_t sample_points,
NDArray dist, IdArray start_idx, IdArray result);
template <DLDeviceType XPU, typename FloatType, typename IdType>
void WeightedNeighborMatching(const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
const int64_t num_nodes = result->shape[0];
const IdType *indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType*>(csr.indices->data);
IdType *result_data = static_cast<IdType*>(result->data);
FloatType *weight_data = static_cast<FloatType*>(weight->data);
// build node visiting order
IdArray vis_order = RandomPerm<IdType>(num_nodes);
IdType *vis_order_data = static_cast<IdType*>(vis_order->data);
for (int64_t n = 0; n < num_nodes; ++n) {
auto u = vis_order_data[n];
// if marked
if (result_data[u] >= 0) continue;
auto v_max = u;
FloatType weight_max = 0;
for (auto e = indptr_data[u]; e < indptr_data[u + 1]; ++e) {
auto v = indices_data[e];
if (result_data[v] >= 0) continue;
if (weight_data[e] >= weight_max) {
v_max = v;
weight_max = weight_data[e];
}
}
result_data[u] = std::min(u, v_max);
result_data[v_max] = result_data[u];
}
}
template void WeightedNeighborMatching<kDLCPU, float, int32_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
template void WeightedNeighborMatching<kDLCPU, float, int64_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
template void WeightedNeighborMatching<kDLCPU, double, int32_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
template void WeightedNeighborMatching<kDLCPU, double, int64_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
template <DLDeviceType XPU, typename IdType>
void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
const int64_t num_nodes = result->shape[0];
const IdType *indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType*>(csr.indices->data);
IdType *result_data = static_cast<IdType*>(result->data);
// build vis order
IdArray u_vis_order = RandomPerm<IdType>(num_nodes);
IdType *u_vis_order_data = static_cast<IdType*>(u_vis_order->data);
IdArray v_vis_order = GroupRandomPerm<IdType>(
indptr_data, csr.indptr->shape[0], csr.indices->shape[0]);
IdType *v_vis_order_data = static_cast<IdType*>(v_vis_order->data);
for (int64_t n = 0; n < num_nodes; ++n) {
auto u = u_vis_order_data[n];
// if marked
if (result_data[u] >= 0) continue;
result_data[u] = u;
for (auto e = indptr_data[u]; e < indptr_data[u + 1]; ++e) {
auto v = indices_data[v_vis_order_data[e]];
if (result_data[v] >= 0) continue;
result_data[u] = std::min(u, v);
result_data[v] = result_data[u];
break;
}
}
}
template void NeighborMatching<kDLCPU, int32_t>(const aten::CSRMatrix &csr, IdArray result);
template void NeighborMatching<kDLCPU, int64_t>(const aten::CSRMatrix &csr, IdArray result);
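
The control flow of the two CPU kernels above boils down to the following sequential Python sketch over a CSR graph (names are hypothetical; `result` starts as all -1, meaning unmatched):

import random

def weighted_neighbor_matching(indptr, indices, weight, result):
    order = list(range(len(result)))
    random.shuffle(order)              # random node visiting order
    for u in order:
        if result[u] >= 0:             # already matched
            continue
        v_max, w_max = u, 0.0
        for e in range(indptr[u], indptr[u + 1]):
            v = indices[e]
            if result[v] < 0 and weight[e] >= w_max:
                v_max, w_max = v, weight[e]
        # match u with its heaviest unmatched neighbor (or itself if none)
        result[u] = result[v_max] = min(u, v_max)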
} // namespace impl
} // namespace geometry
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file geometry/cuda/edge_coarsening_impl.cu
* \brief Edge coarsening CUDA implementation
*/
#include <dgl/array.h>
#include <dgl/random.h>
#include <dmlc/thread_local.h>
#include <curand.h>
#include <cstdint>
#include "../geometry_op.h"
#include "../../runtime/cuda/cuda_common.h"
#include "../../array/cuda/utils.h"
#define BLOCKS(N, T) (((N) + (T) - 1) / (T))
namespace dgl {
namespace geometry {
namespace impl {
constexpr float BLUE_P = 0.53406;
constexpr int BLUE = -1;
constexpr int RED = -2;
constexpr int EMPTY_IDX = -1;
__device__ bool done_d;
__global__ void init_done_kernel() { done_d = true; }
template <typename IdType>
__global__ void colorize_kernel(const float *prop, int64_t num_elem, IdType *result) {
const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_elem) {
if (result[idx] < 0) { // if unmatched
result[idx] = (prop[idx] > BLUE_P) ? RED : BLUE;
done_d = false;
}
}
}
template <typename FloatType, typename IdType>
__global__ void weighted_propose_kernel(const IdType *indptr, const IdType *indices,
const FloatType *weights, int64_t num_elem,
IdType *proposal, IdType *result) {
const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_elem) {
if (result[idx] != BLUE) return;
bool has_unmatched_neighbor = false;
FloatType weight_max = 0.;
IdType v_max = EMPTY_IDX;
for (IdType i = indptr[idx]; i < indptr[idx + 1]; ++i) {
auto v = indices[i];
if (result[v] < 0)
has_unmatched_neighbor = true;
if (result[v] == RED && weights[i] >= weight_max) {
v_max = v;
weight_max = weights[i];
}
}
proposal[idx] = v_max;
if (!has_unmatched_neighbor)
result[idx] = idx;
}
}
template <typename FloatType, typename IdType>
__global__ void weighted_respond_kernel(const IdType *indptr, const IdType *indices,
const FloatType *weights, int64_t num_elem,
IdType *proposal, IdType *result) {
const IdType idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_elem) {
if (result[idx] != RED) return;
bool has_unmatched_neighbors = false;
IdType v_max = -1;
FloatType weight_max = 0.;
for (IdType i = indptr[idx]; i < indptr[idx + 1]; ++i) {
auto v = indices[i];
if (result[v] < 0) {
has_unmatched_neighbors = true;
}
if (result[v] == BLUE
&& proposal[v] == idx
&& weights[i] >= weight_max) {
v_max = v;
weight_max = weights[i];
}
}
if (v_max >= 0) {
result[v_max] = min(idx, v_max);
result[idx] = min(idx, v_max);
}
if (!has_unmatched_neighbors)
result[idx] = idx;
}
}
/*! \brief The colorize procedure. This procedure randomly marks unmarked
* nodes with BLUE(-1) and RED(-2) and checks whether the node matching
* process has finished.
*/
template<typename IdType>
bool Colorize(IdType * result_data, curandGenerator_t gen, int64_t num_nodes) {
// initialize the done signal
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, thr_entry->stream);
// generate color prop for each node
float *prop;
CUDA_CALL(cudaMalloc(reinterpret_cast<void **>(&prop), num_nodes * sizeof(float)));
CURAND_CALL(curandGenerateUniform(gen, prop, num_nodes));
cudaDeviceSynchronize(); // wait for random number generation to finish, since curand is asynchronous
// call kernel
auto num_threads = cuda::FindNumThreads(num_nodes);
auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_nodes, num_threads));
CUDA_KERNEL_CALL(colorize_kernel, num_blocks, num_threads, 0, thr_entry->stream,
prop, num_nodes, result_data);
bool done_h = false;
CUDA_CALL(cudaMemcpyFromSymbol(&done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(prop));
return done_h;
}
/*! \brief Weighted neighbor matching procedure (GPU version).
 * This implementation is from `A GPU Algorithm for Greedy Graph Matching
 * <http://www.staff.science.uu.nl/~bisse101/Articles/match12.pdf>`__
 *
 * This algorithm has three parts: colorize, propose and respond.
 * In the colorize procedure, each unmarked node is randomly marked as BLUE or
 * RED. If all nodes are marked, the algorithm finishes and returns.
 * In the propose procedure, each BLUE node proposes to the RED neighbor
 * with the largest weight (or to a random one if no weight is given).
 * If all of its neighbors are already marked, the node is marked with its own id.
 * In the respond procedure, each RED node responds to the BLUE neighbor
 * that has proposed to it with the largest weight. If all of its neighbors
 * are marked, the node is marked with its own id. Otherwise, the (BLUE, RED)
 * pair is matched and both nodes are marked with the smaller of their two ids.
 */
template <DLDeviceType XPU, typename FloatType, typename IdType>
void WeightedNeighborMatching(const aten::CSRMatrix &csr, const NDArray weight, IdArray result) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
if (!thr_entry->curand_gen) {
uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX);
CURAND_CALL(curandCreateGenerator(&thr_entry->curand_gen, CURAND_RNG_PSEUDO_DEFAULT));
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(thr_entry->curand_gen, seed));
}
// create proposal tensor
const int64_t num_nodes = result->shape[0];
IdArray proposal = aten::Full(-1, num_nodes, sizeof(IdType) * 8, result->ctx);
// get data ptrs
IdType *indptr_data = static_cast<IdType*>(csr.indptr->data);
IdType *indices_data = static_cast<IdType*>(csr.indices->data);
IdType *result_data = static_cast<IdType*>(result->data);
IdType *proposal_data = static_cast<IdType*>(proposal->data);
FloatType *weight_data = static_cast<FloatType*>(weight->data);
auto num_threads = cuda::FindNumThreads(num_nodes);
auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_nodes, num_threads));
while (!Colorize<IdType>(result_data, thr_entry->curand_gen, num_nodes)) {
CUDA_KERNEL_CALL(weighted_propose_kernel, num_blocks, num_threads, 0, thr_entry->stream,
indptr_data, indices_data, weight_data, num_nodes, proposal_data, result_data);
CUDA_KERNEL_CALL(weighted_respond_kernel, num_blocks, num_threads, 0, thr_entry->stream,
indptr_data, indices_data, weight_data, num_nodes, proposal_data, result_data);
}
}
template void WeightedNeighborMatching<kDLGPU, float, int32_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
template void WeightedNeighborMatching<kDLGPU, float, int64_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
template void WeightedNeighborMatching<kDLGPU, double, int32_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
template void WeightedNeighborMatching<kDLGPU, double, int64_t>(
const aten::CSRMatrix &csr, const NDArray weight, IdArray result);
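
At a high level, the host loop above alternates the three device kernels until Colorize reports completion. A sequential Python sketch of the same colorize/propose/respond scheme (illustrative only; BLUE, RED and BLUE_P mirror the constants defined at the top of this file):

import random

BLUE, RED, BLUE_P = -1, -2, 0.53406

def colorize(result):
    # randomly mark still-unmatched nodes; True when everyone is matched
    done = True
    for u in range(len(result)):
        if result[u] < 0:
            result[u] = RED if random.random() > BLUE_P else BLUE
            done = False
    return done

def propose(indptr, indices, weights, proposal, result):
    for u in range(len(result)):
        if result[u] != BLUE:
            continue
        has_unmatched, v_max, w_max = False, -1, 0.0
        for e in range(indptr[u], indptr[u + 1]):
            v = indices[e]
            if result[v] < 0:
                has_unmatched = True
            if result[v] == RED and weights[e] >= w_max:
                v_max, w_max = v, weights[e]
        proposal[u] = v_max               # heaviest RED neighbor, if any
        if not has_unmatched:
            result[u] = u                 # no partner left: match with itself

def respond(indptr, indices, weights, proposal, result):
    for u in range(len(result)):
        if result[u] != RED:
            continue
        has_unmatched, v_max, w_max = False, -1, 0.0
        for e in range(indptr[u], indptr[u + 1]):
            v = indices[e]
            if result[v] < 0:
                has_unmatched = True
            if result[v] == BLUE and proposal[v] == u and weights[e] >= w_max:
                v_max, w_max = v, weights[e]
        if v_max >= 0:                    # accept the heaviest proposer
            result[u] = result[v_max] = min(u, v_max)
        elif not has_unmatched:
            result[u] = u

def weighted_matching(indptr, indices, weights, result):
    proposal = [-1] * len(result)
    while not colorize(result):
        propose(indptr, indices, weights, proposal, result)
        respond(indptr, indices, weights, proposal, result)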
/*! \brief Unweighted neighbor matching procedure (GPU version).
 * Instead of directly sampling neighbors, we assign each edge
 * a random weight. We use random weights for 2 reasons:
 * 1. Random sampling for each node on GPU is expensive. Although
 *    we could perform a global group-wise (the neighborhood of each
 *    node forms a group) random permutation as in the CPU version,
 *    it still costs too much compared to directly using random weights.
 * 2. The graph is sparse, so the neighborhood of each node is small,
 *    which is suitable for a GPU implementation.
 */
template <DLDeviceType XPU, typename IdType>
void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) {
const int64_t num_edges = csr.indices->shape[0];
// generate random weights
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
if (!thr_entry->curand_gen) {
uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX);
CURAND_CALL(curandCreateGenerator(&thr_entry->curand_gen, CURAND_RNG_PSEUDO_DEFAULT));
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(thr_entry->curand_gen, seed));
}
NDArray weight = NDArray::Empty(
{num_edges}, DLDataType{kDLFloat, sizeof(float) * 8, 1}, result->ctx);
float *weight_data = static_cast<float*>(weight->data);
CURAND_CALL(curandGenerateUniform(thr_entry->curand_gen, weight_data, num_edges));
cudaDeviceSynchronize();
WeightedNeighborMatching<XPU, float, IdType>(csr, weight, result);
}
template void NeighborMatching<kDLGPU, int32_t>(const aten::CSRMatrix &csr, IdArray result);
template void NeighborMatching<kDLGPU, int64_t>(const aten::CSRMatrix &csr, IdArray result);
} // namespace impl
} // namespace geometry
} // namespace dgl
@@ -5,8 +5,10 @@
 */
#include <dgl/array.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/base_heterograph.h>
#include "../c_api_common.h"
#include "./geometry_op.h"
#include "../array/check.h"

using namespace dgl::runtime;

@@ -31,6 +33,26 @@ void FarthestPointSampler(NDArray array, int64_t batch_size, int64_t sample_points,
});
}
void NeighborMatching(HeteroGraphPtr graph, const NDArray weight, IdArray result) {
if (!aten::IsNullArray(weight)) {
ATEN_XPU_SWITCH_CUDA(graph->Context().device_type, XPU, "NeighborMatching", {
ATEN_FLOAT_TYPE_SWITCH(weight->dtype, FloatType, "weight", {
ATEN_ID_TYPE_SWITCH(graph->DataType(), IdType, {
impl::WeightedNeighborMatching<XPU, FloatType, IdType>(
graph->GetCSRMatrix(0), weight, result);
});
});
});
} else {
ATEN_XPU_SWITCH_CUDA(graph->Context().device_type, XPU, "NeighborMatching", {
ATEN_ID_TYPE_SWITCH(graph->DataType(), IdType, {
impl::NeighborMatching<XPU, IdType>(
graph->GetCSRMatrix(0), result);
});
});
}
}
///////////////////////// C APIs /////////////////////////

DGL_REGISTER_GLOBAL("geometry._CAPI_FarthestPointSampler")
@@ -45,5 +67,31 @@ DGL_REGISTER_GLOBAL("geometry._CAPI_FarthestPointSampler")
FarthestPointSampler(data, batch_size, sample_points, dist, start_idx, result);
});
DGL_REGISTER_GLOBAL("geometry._CAPI_NeighborMatching")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef graph = args[0];
const NDArray weight = args[1];
IdArray result = args[2];
// sanity check
aten::CheckCtx(graph->Context(), {weight, result}, {"edge_weight", "result"});
aten::CheckContiguous({weight, result}, {"edge_weight", "result"});
CHECK_EQ(graph->NumEdgeTypes(), 1) << "A homogeneous graph has only one edge type.";
CHECK_EQ(result->ndim, 1) << "result should be a 1D tensor.";
auto pair = graph->meta_graph()->FindEdge(0);
const dgl_type_t node_type = pair.first;
CHECK_EQ(graph->NumVertices(node_type), result->shape[0])
<< "The number of nodes should be the same as the length of the result tensor.";
if (!aten::IsNullArray(weight)) {
CHECK_EQ(weight->ndim, 1) << "weight should be a 1D tensor.";
CHECK_EQ(graph->NumEdges(0), weight->shape[0])
<< "The number of edges in the graph should be the same "
<< "as the length of the edge weight tensor.";
}
// call implementation
NeighborMatching(graph.sptr(), weight, result);
});
} // namespace geometry
} // namespace dgl
@@ -16,6 +16,22 @@ template <DLDeviceType XPU, typename FloatType, typename IdType>
void FarthestPointSampler(NDArray array, int64_t batch_size, int64_t sample_points,
NDArray dist, IdArray start_idx, IdArray result);
/*! \brief Implementation of the weighted neighbor matching process of edge coarsening used
 * in Metis and Graclus for homogeneous graph coarsening. This procedure keeps
 * picking an unmarked vertex and matching it with one of its unmarked neighbors
 * (the one that maximizes the edge weight) until no more matches can be made.
 */
template <DLDeviceType XPU, typename FloatType, typename IdType>
void WeightedNeighborMatching(const aten::CSRMatrix &csr, const NDArray weight, IdArray result);

/*! \brief Implementation of the (unweighted) neighbor matching process of edge coarsening used
 * in Metis and Graclus for homogeneous graph coarsening. This procedure keeps
 * picking an unmarked vertex and matching it with one of its unmarked neighbors
 * at random until no more matches can be made.
 */
template <DLDeviceType XPU, typename IdType>
void NeighborMatching(const aten::CSRMatrix &csr, IdArray result);
} // namespace impl
} // namespace geometry
} // namespace dgl
...
@@ -10,6 +10,7 @@
#include "./binary_reduce_impl_decl.h"
#include "./utils.h"
#include "../c_api_common.h"
#include "../array/check.h"
#include "../graph/unit_graph.h"
#include "./csr_interface.h"

@@ -175,20 +176,6 @@ std::string IdArrayToStr(IdArray arr) {
return oss.str();
}
// Check whether the given arguments have the same context.
inline void CheckCtx(
const DLContext& ctx,
const std::vector<NDArray>& arrays,
const std::vector<std::string>& names) {
for (size_t i = 0; i < arrays.size(); ++i) {
if (aten::IsNullArray(arrays[i]))
continue;
CHECK_EQ(ctx, arrays[i]->ctx)
<< "Expected device context " << ctx << ". But got "
<< arrays[i]->ctx << " for " << names[i] << ".";
}
}
// Check whether the given arguments use the same number of bits.
inline void CheckIdArray(
const uint8_t bits,
@@ -303,7 +290,7 @@ void BinaryOpReduce(
NDArray out_mapping) {
const auto& ctx = graph.Context();
// sanity check
aten::CheckCtx(ctx,
{lhs_data, rhs_data, out_data, lhs_mapping, rhs_mapping, out_mapping},
{"lhs_data", "rhs_data", "out_data", "lhs_mapping", "rhs_mapping", "out_mapping"});
CheckIdArray(graph.NumBits(),
@@ -392,7 +379,7 @@ void BackwardLhsBinaryOpReduce(
NDArray grad_lhs_data) {
const auto& ctx = graph.Context();
// sanity check
aten::CheckCtx(ctx,
{lhs_data, rhs_data, out_data, grad_out_data, grad_lhs_data,
lhs_mapping, rhs_mapping, out_mapping},
{"lhs_data", "rhs_data", "out_data", "grad_out_data", "grad_lhs_data",
@@ -469,7 +456,7 @@ void BackwardRhsBinaryOpReduce(
NDArray grad_rhs_data) {
const auto& ctx = graph.Context();
// sanity check
aten::CheckCtx(ctx,
{lhs_data, rhs_data, out_data, grad_out_data, grad_rhs_data,
lhs_mapping, rhs_mapping, out_mapping},
{"lhs_data", "rhs_data", "out_data", "grad_out_data", "grad_rhs_data",
@@ -539,7 +526,7 @@ void CopyReduce(
NDArray in_mapping, NDArray out_mapping) {
const auto& ctx = graph.Context();
// sanity check
aten::CheckCtx(ctx,
{in_data, out_data, in_mapping, out_mapping},
{"in_data", "out_data", "in_mapping", "out_mapping"});
CheckIdArray(graph.NumBits(),
@@ -582,7 +569,7 @@ void BackwardCopyReduce(
NDArray grad_in_data) {
const auto& ctx = graph.Context();
// sanity check
aten::CheckCtx(ctx,
{in_data, out_data, grad_out_data, grad_in_data, in_mapping, out_mapping},
{"in_data", "out_data", "grad_out_data", "grad_in_data", "in_mapping", "out_mapping"});
CheckIdArray(graph.NumBits(),
...
@@ -10,6 +10,10 @@
#include <dgl/random.h>
#include <dgl/array.h>

#ifdef DGL_USE_CUDA
#include "../runtime/cuda/cuda_common.h"
#endif // DGL_USE_CUDA

using namespace dgl::runtime;

namespace dgl {

@@ -18,8 +22,18 @@ DGL_REGISTER_GLOBAL("rng._CAPI_SetSeed")
.set_body([] (DGLArgs args, DGLRetValue *rv) {
const int seed = args[0];
#pragma omp parallel for
for (int i = 0; i < omp_get_max_threads(); ++i) {
RandomEngine::ThreadLocal()->SetSeed(seed);
#ifdef DGL_USE_CUDA
auto* thr_entry = CUDAThreadEntry::ThreadLocal();
if (!thr_entry->curand_gen) {
CURAND_CALL(curandCreateGenerator(&thr_entry->curand_gen, CURAND_RNG_PSEUDO_DEFAULT));
}
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
thr_entry->curand_gen,
static_cast<uint64_t>(seed + GetThreadId())));
#endif // DGL_USE_CUDA
}
});

DGL_REGISTER_GLOBAL("rng._CAPI_Choice")
...
@@ -9,6 +9,7 @@
#include <cublas_v2.h>
#include <cusparse.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <dgl/runtime/packed_func.h>
#include <string>
#include "../workspace_pool.h"

@@ -70,6 +71,47 @@ inline bool is_zero<dim3>(dim3 size) {
CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
}
#define CURAND_CALL(func) \
{ \
curandStatus_t e = (func); \
CHECK(e == CURAND_STATUS_SUCCESS) \
<< "CURAND Error: " << dgl::runtime::curandGetErrorString(e) \
<< " at " << __FILE__ << ":" << __LINE__; \
}
inline const char* curandGetErrorString(curandStatus_t error) {
switch (error) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
}
// To suppress compiler warning.
return "Unrecognized curand error string";
}
/*
 * \brief Cast data type to cudaDataType_t.
 */
@@ -122,6 +164,8 @@ class CUDAThreadEntry {
cusparseHandle_t cusparse_handle{nullptr};
/*! \brief The cublas handler */
cublasHandle_t cublas_handle{nullptr};
/*! \brief The curand generator */
curandGenerator_t curand_gen{nullptr};
/*! \brief thread local pool*/
WorkspacePool pool;
/*! \brief constructor */
...
import backend as F
import dgl.nn
import dgl
import numpy as np
import pytest
import torch as th
from dgl.geometry.pytorch import FarthestPointSampler
from dgl.geometry import neighbor_matching
from test_utils import parametrize_dtype
from test_utils.graph_cases import get_cases

def test_fps():
    N = 1000
@@ -43,6 +49,44 @@ def test_knn():
    check_knn(g, x, 0, 3)
    check_knn(g, x, 3, 8)

@parametrize_dtype
@pytest.mark.parametrize('g', get_cases(['homo'], exclude=['dglgraph']))
@pytest.mark.parametrize('weight', [True, False])
@pytest.mark.parametrize('relabel', [True, False])
def test_edge_coarsening(idtype, g, weight, relabel):
    num_nodes = g.num_nodes()
    g = dgl.to_bidirected(g)
    g = g.astype(idtype).to(F.ctx())
    edge_weight = None
    if weight:
        edge_weight = F.abs(F.randn((g.num_edges(),))).to(F.ctx())
    node_labels = neighbor_matching(g, edge_weight, relabel_idx=relabel)
    unique_ids, counts = th.unique(node_labels, return_counts=True)
    num_result_ids = unique_ids.size(0)

    # shape correct
    assert node_labels.shape == (g.num_nodes(),)

    # all nodes marked
    assert F.reduce_sum(node_labels < 0).item() == 0

    # number of unique node ids correct.
    assert num_result_ids >= num_nodes // 2 and num_result_ids <= num_nodes

    # each unique id has <= 2 nodes
    assert F.reduce_sum(counts > 2).item() == 0

    # if two nodes have the same id, they must be neighbors
    idxs = F.arange(0, num_nodes, idtype)
    for l in unique_ids:
        l = l.item()
        idx = idxs[(node_labels == l)]
        if idx.size(0) == 2:
            u, v = idx[0].item(), idx[1].item()
            assert g.has_edges_between(u, v)
if __name__ == '__main__':
    test_fps()
    test_knn()