Unverified Commit 7bab1365 authored by Zihao Ye, committed by GitHub

[feature] Support half-precision floating-point data type (fp16). (#2552)



* add tvm as submodule

* compilation is ok but calling fails

* can call now

* pack multiple modules, change names

* upd

* upd

* upd

* fix cmake

* upd

* upd

* upd

* upd

* fix

* relative path

* upd

* upd

* upd

* singleton

* upd

* trigger

* fix

* upd

* count reducible

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* only keep related files

* upd

* upd

* upd

* upd

* lint

* lint

* lint

* lint

* pylint

* upd

* upd

* compilation

* fix

* upd

* upd

* upd

* upd

* upd

* upd

* upd doc

* refactor

* fix

* upd number
Co-authored-by: Zhi Lin <linzhilynn@gmail.com>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-42-78.us-east-2.compute.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-21-156.us-east-2.compute.internal>
Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
parent a7e941c3
@@ -6,6 +6,7 @@
 #include <dgl/array.h>
 #include "./segment_reduce.cuh"
 #include "./functor.cuh"
+#include "./utils.h"

 namespace dgl {
@@ -13,70 +14,97 @@ using namespace cuda;
 namespace aten {

-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SegmentReduce(const std::string& op,
                    NDArray feat,
                    NDArray offsets,
                    NDArray out,
                    NDArray arg) {
-  if (op == "sum") {
-    cuda::SegmentReduce<IdType, DType, cuda::reduce::Sum<IdType, DType>>(
-        feat, offsets, out, arg);
-  } else if (op == "max") {
-    cuda::SegmentReduce<IdType, DType, cuda::reduce::Max<IdType, DType>>(
-        feat, offsets, out, arg);
-  } else if (op == "min") {
-    cuda::SegmentReduce<IdType, DType, cuda::reduce::Min<IdType, DType>>(
-        feat, offsets, out, arg);
-  } else {
-    LOG(FATAL) << "Not implemented";
-  }
+  SWITCH_BITS(bits, DType, {
+    if (op == "sum") {
+      cuda::SegmentReduce<IdType, DType, cuda::reduce::Sum<IdType, DType>>(
+          feat, offsets, out, arg);
+    } else if (op == "max") {
+      cuda::SegmentReduce<IdType, DType, cuda::reduce::Max<IdType, DType>>(
+          feat, offsets, out, arg);
+    } else if (op == "min") {
+      cuda::SegmentReduce<IdType, DType, cuda::reduce::Min<IdType, DType>>(
+          feat, offsets, out, arg);
+    } else {
+      LOG(FATAL) << "Not implemented";
+    }
+  });
 }

-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void BackwardSegmentCmp(NDArray feat,
                         NDArray arg,
                         NDArray out) {
-  cuda::BackwardSegmentCmp<IdType, DType>(feat, arg, out);
+  SWITCH_BITS(bits, DType, {
+    cuda::BackwardSegmentCmp<IdType, DType>(feat, arg, out);
+  });
 }

-template void SegmentReduce<kDLGPU, int32_t, float>(
+template void SegmentReduce<kDLGPU, int32_t, 16>(
    const std::string& op,
    NDArray feat,
    NDArray offsets,
    NDArray out,
    NDArray arg);
-template void SegmentReduce<kDLGPU, int64_t, float>(
+template void SegmentReduce<kDLGPU, int64_t, 16>(
    const std::string &op,
    NDArray feat,
    NDArray offsets,
    NDArray out,
    NDArray arg);
-template void SegmentReduce<kDLGPU, int32_t, double>(
+template void SegmentReduce<kDLGPU, int32_t, 32>(
+   const std::string& op,
+   NDArray feat,
+   NDArray offsets,
+   NDArray out,
+   NDArray arg);
+template void SegmentReduce<kDLGPU, int64_t, 32>(
    const std::string &op,
    NDArray feat,
    NDArray offsets,
    NDArray out,
    NDArray arg);
-template void SegmentReduce<kDLGPU, int64_t, double>(
+template void SegmentReduce<kDLGPU, int32_t, 64>(
    const std::string &op,
    NDArray feat,
    NDArray offsets,
    NDArray out,
    NDArray arg);
-template void BackwardSegmentCmp<kDLGPU, int32_t, float>(
+template void SegmentReduce<kDLGPU, int64_t, 64>(
+   const std::string &op,
+   NDArray feat,
+   NDArray offsets,
+   NDArray out,
+   NDArray arg);
+template void BackwardSegmentCmp<kDLGPU, int32_t, 16>(
+   NDArray feat,
+   NDArray arg,
+   NDArray out);
+template void BackwardSegmentCmp<kDLGPU, int64_t, 16>(
+   NDArray feat,
+   NDArray arg,
+   NDArray out);
+template void BackwardSegmentCmp<kDLGPU, int32_t, 32>(
    NDArray feat,
    NDArray arg,
    NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int64_t, float>(
+template void BackwardSegmentCmp<kDLGPU, int64_t, 32>(
    NDArray feat,
    NDArray arg,
    NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int32_t, double>(
+template void BackwardSegmentCmp<kDLGPU, int32_t, 64>(
    NDArray feat,
    NDArray arg,
    NDArray out);
-template void BackwardSegmentCmp<kDLGPU, int64_t, double>(
+template void BackwardSegmentCmp<kDLGPU, int64_t, 64>(
    NDArray feat,
    NDArray arg,
    NDArray out);
...
@@ -7,7 +7,6 @@
 #define DGL_ARRAY_SEGMENT_REDUCE_CUH_

 #include "../../runtime/cuda/cuda_common.h"
-#include "./atomic.cuh"
 #include "./utils.h"

 namespace dgl {
@@ -31,7 +30,7 @@ __global__ void SegmentReduceKernel(
   int row = blockIdx.x;
   int col = blockIdx.y * blockDim.x + threadIdx.x;
   if (col < dim) {
-    DType local_accum = ReduceOp::zero;
+    DType local_accum = ReduceOp::zero();
     IdType local_arg = -1;
     for (IdType i = offsets[row]; i < offsets[row + 1]; ++i) {
       ReduceOp::Call(&local_accum, &local_arg, feat[i * dim + col], i);
...
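Note on the `ReduceOp::zero` → `ReduceOp::zero()` change above: it is what lets the reducers hand out an identity element for half as well, since a plain static constexpr data member cannot easily be initialized when DType is __half, whereas a device function can construct the value on demand. A minimal sketch of a reducer written in that style (illustrative only, not DGL's actual cuda::reduce::Sum):

#include <cuda_fp16.h>

// Illustrative reducer; DGL's cuda::reduce::Sum differs in detail.
template <typename Idx, typename DType>
struct SumSketch {
  // The identity is returned by a function so the same code compiles for __half,
  // which has no constexpr constructor usable in a static data member initializer.
  static __device__ __forceinline__ DType zero() { return static_cast<DType>(0.0f); }
  static __device__ __forceinline__ void Call(DType* acc, Idx* arg, DType val, Idx idx) {
    *acc = *acc + val;  // half arithmetic needs the fp16 operators from cuda_fp16.h (sm_53+)
  }
};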
@@ -14,6 +14,140 @@ namespace dgl {
 using namespace cuda;

 namespace aten {
+namespace {
+
+/*! \brief Call cuBLAS geam API for transpose operation for float and double. */
+template <typename DType>
+cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
+                     cublasOperation_t transb, int m, int n,
+                     const DType* alpha, const DType* A, int lda,
+                     const DType* beta, const DType* B, int ldb,
+                     DType* C, int ldc) {
+  LOG(INFO) << "Not supported dtype";
+  return CUBLAS_STATUS_EXECUTION_FAILED;
+}
+
+template <>
+cublasStatus_t Xgeam<float>(cublasHandle_t handle, cublasOperation_t transa,
+                            cublasOperation_t transb, int m, int n,
+                            const float* alpha, const float* A, int lda,
+                            const float* beta, const float* B, int ldb,
+                            float* C, int ldc) {
+  return cublasSgeam(handle, transa, transb, m, n, alpha, A, lda,
+                     beta, B, ldb, C, ldc);
+}
+
+template <>
+cublasStatus_t Xgeam<double>(cublasHandle_t handle, cublasOperation_t transa,
+                             cublasOperation_t transb, int m, int n,
+                             const double* alpha, const double* A, int lda,
+                             const double* beta, const double* B, int ldb,
+                             double* C, int ldc) {
+  return cublasDgeam(handle, transa, transb, m, n, alpha, A, lda,
+                     beta, B, ldb, C, ldc);
+}
+
+/* \brief IndexSelect operator kernel implementation.
+ * \note duplicate of IndexSelectKernel defined in array_index_select.cu
+ */
+template <typename DType, typename IdType>
+__global__ void _IndexSelectKernel(
+    const DType* __restrict__ in,
+    const IdType* __restrict__ idx,
+    DType* __restrict__ out,
+    int n, int m) {
+  int i = blockIdx.x;
+  for (int j = threadIdx.x; j < m; j += blockDim.x)
+    out[i * m + j] = in[idx[i] * m + j];
+}
+
+/* \brief Transpose operator kernel implementation.
+ * \note not efficient but it's not a bottleneck, used for float16 dtype.
+ */
+template <typename DType>
+__global__ void _TransposeKernel(
+    const DType* __restrict__ in,
+    DType* __restrict__ out,
+    int n, int m) {
+  int i = blockIdx.x;
+  for (int j = threadIdx.x; j < m; j += blockDim.x)
+    out[i * m + j] = in[j * n + i];
+}
+
+/*
+ * \brief Tranpose the input matrix.
+ * \param row number of rows of input matrix.
+ * \param col number of columns of input matrix.
+ */
+template <typename DType>
+void _Transpose(const DType* in, DType* out,
+                int row, int col) {
+  DType alpha = 1., beta = 0.;
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  if (!thr_entry->cublas_handle)
+    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
+  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, thr_entry->stream));
+  CUBLAS_CALL(Xgeam<DType>(
+      thr_entry->cublas_handle,
+      CUBLAS_OP_T,
+      CUBLAS_OP_N,
+      row, col,
+      &alpha, in, col,
+      &beta, nullptr, row,
+      out, row));
+}
+
+/*
+ * \brief Tranpose the input matrix for data type half.
+ * \note cuBLAS has no geam API for half data type, fallback to our kernel.
+ */
+template <>
+void _Transpose<half>(const half* in, half* out,
+                      int row, int col) {
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  int nt = FindNumThreads(row);
+  int nb = col;
+  CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, thr_entry->stream, in, out, col, row);
+}
+
+/*
+ * \brief
+ */
+template <typename DType, typename IdType>
+__global__ void _IndexSelectKernel(const DType* array, const IdType* index,
+                                   int64_t length, DType* out) {
+  int tx = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride_x = gridDim.x * blockDim.x;
+  while (tx < length) {
+    out[tx] = array[index[tx]];
+    tx += stride_x;
+  }
+}
+
+/* \brief IndexSelect operator.
+ * \note duplicate of IndexSelect defined in array_op.h but it can
+ *       not be applied to float16 dtype.
+ */
+template<typename DType, typename IdType>
+NDArray _IndexSelect(NDArray array, NDArray index) {
+  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  const DType* array_data = static_cast<DType*>(array->data);
+  const IdType* idx_data = static_cast<IdType*>(index->data);
+  const int64_t arr_len = array->shape[0];
+  const int64_t len = index->shape[0];
+  NDArray ret = NDArray::Empty({len}, array->dtype, array->ctx);
+  if (len == 0)
+    return ret;
+  DType* ret_data = static_cast<DType*>(ret->data);
+  const int nt = FindNumThreads(len);
+  const int nb = (len + nt - 1) / nt;
+  CUDA_KERNEL_CALL(_IndexSelectKernel, nb, nt, 0, thr_entry->stream,
+                   array_data, idx_data, len, ret_data);
+  return ret;
+}
+
+}  // namespace
+
 namespace cusparse {

 #if CUDART_VERSION < 11000
@@ -50,38 +184,8 @@ cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t tr
 }
 #endif

-template <typename DType>
-cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
-                     cublasOperation_t transb, int m, int n,
-                     const DType* alpha, const DType* A, int lda,
-                     const DType* beta, const DType* B, int ldb,
-                     DType* C, int ldc) {
-  LOG(INFO) << "Not supported dtype";
-  return CUBLAS_STATUS_EXECUTION_FAILED;
-}
-
-template <>
-cublasStatus_t Xgeam<float>(cublasHandle_t handle, cublasOperation_t transa,
-                            cublasOperation_t transb, int m, int n,
-                            const float* alpha, const float* A, int lda,
-                            const float* beta, const float* B, int ldb,
-                            float* C, int ldc) {
-  return cublasSgeam(handle, transa, transb, m, n, alpha, A, lda,
-                     beta, B, ldb, C, ldc);
-}
-
-template <>
-cublasStatus_t Xgeam<double>(cublasHandle_t handle, cublasOperation_t transa,
-                             cublasOperation_t transb, int m, int n,
-                             const double* alpha, const double* A, int lda,
-                             const double* beta, const double* B, int ldb,
-                             double* C, int ldc) {
-  return cublasDgeam(handle, transa, transb, m, n, alpha, A, lda,
-                     beta, B, ldb, C, ldc);
-}
-
 /*! Cusparse implementation of SpMM on Csr format. */
-template <typename DType>
+template <typename DType, typename IdType>
 void CusparseCsrmm2(
     const DLContext& ctx,
     const CSRMatrix& csr,
@@ -118,20 +222,21 @@ void CusparseCsrmm2(
 #if CUDART_VERSION >= 11000
   cusparseSpMatDescr_t matA;
   cusparseDnMatDescr_t matB, matC;
-  constexpr auto cuda_dtype = std::is_same<DType, float>::value ? CUDA_R_32F: CUDA_R_64F;
+  constexpr auto dtype = cuda_dtype<DType>::value;
+  constexpr auto idtype = cusparse_idtype<IdType>::value;
   CUSPARSE_CALL(cusparseCreateCsr(&matA,
       m, k, nnz,
-      static_cast<int32_t*>(csr.indptr->data),
-      static_cast<int32_t*>(csr.indices->data),
+      static_cast<IdType*>(csr.indptr->data),
+      static_cast<IdType*>(csr.indices->data),
       const_cast<DType*>(valptr? valptr : A_data),
-      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-      CUSPARSE_INDEX_BASE_ZERO, cuda_dtype));
+      idtype, idtype,
+      CUSPARSE_INDEX_BASE_ZERO, dtype));
   CUSPARSE_CALL(cusparseCreateDnMat(&matB,
       k, n, n,
-      const_cast<DType*>(B_data), cuda_dtype, CUSPARSE_ORDER_ROW));
+      const_cast<DType*>(B_data), dtype, CUSPARSE_ORDER_ROW));
   CUSPARSE_CALL(cusparseCreateDnMat(&matC,
       m, n, n,
-      C_data, cuda_dtype, CUSPARSE_ORDER_ROW));
+      C_data, dtype, CUSPARSE_ORDER_ROW));
   auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
   auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -139,13 +244,13 @@ void CusparseCsrmm2(
   CUSPARSE_CALL(cusparseSpMM_bufferSize(
       thr_entry->cusparse_handle, transA, transB,
       &alpha, matA, matB, &beta, matC,
-      cuda_dtype, CUSPARSE_SPMM_CSR_ALG2,
+      dtype, CUSPARSE_SPMM_CSR_ALG2,
       &workspace_size));
   void* workspace = device->AllocWorkspace(ctx, workspace_size);
   CUSPARSE_CALL(cusparseSpMM(
       thr_entry->cusparse_handle, transA, transB,
       &alpha, matA, matB, &beta, matC,
-      cuda_dtype, CUSPARSE_SPMM_CSR_ALG2,
+      dtype, CUSPARSE_SPMM_CSR_ALG2,
       workspace));
   device->FreeWorkspace(ctx, workspace);
@@ -171,17 +276,7 @@ void CusparseCsrmm2(
       B_data, n, &beta, trans_out, m));
   CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
   // transpose the output matrix
-  if (!thr_entry->cublas_handle)
-    CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
-  CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, thr_entry->stream));
-  CUBLAS_CALL(Xgeam<DType>(
-      thr_entry->cublas_handle,
-      CUBLAS_OP_T,
-      CUBLAS_OP_N,
-      n, m,
-      &alpha, trans_out, m,
-      &beta, nullptr, n,
-      C_data, n));
+  _Transpose(trans_out, C_data, n, m);
   device->FreeWorkspace(ctx, trans_out);
 #endif
   if (valptr)
@@ -214,12 +309,29 @@ void CusparseCsrmm2(
   }                                                          \
 } while (0)

+/*!
+ * \brief Determine whether cusparse SpMM function is applicable.
+ */
+template <int bits, typename IdType>
+inline bool cusparse_available() {
+#if CUDART_VERSION < 11000
+  if (std::is_same<IdType, int>::value)
+    if (bits > 16)
+      return true;
+  return false;
+#else
+  if (bits == 16)
+    return false;  // cusparse's SpMM on fp16 is slow, temporally disabled.
+  return true;
+#endif
+}
+
 /*!
  * \brief CUDA implementation of g-SpMM on Csr format.
  * \note use cusparse if the reduce operator is `sum` and there is
  *       no broadcast, use dgl's kernel in other cases.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SpMMCsr(const std::string& op, const std::string& reduce,
              const BcastOff& bcast,
              const CSRMatrix& csr,
@@ -232,53 +344,67 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
   bool use_efeat = op != "copy_lhs";
   if (reduce == "sum") {
-    if (sizeof(IdType) == 4 && op == "copy_lhs") {  // cusparse
+    if (op == "copy_lhs" && cusparse_available<bits, IdType>()) {  // cusparse
       int64_t x_length = 1;
       for (int i = 1; i < ufeat->ndim; ++i)
         x_length *= ufeat->shape[i];
-      cusparse::CusparseCsrmm2<DType>(
-          ufeat->ctx, csr,
-          static_cast<DType*>(ufeat->data),
-          nullptr,
-          static_cast<DType*>(out->data),
-          x_length);
-    } else if (sizeof(IdType) == 4 && op == "mul" && is_scalar_efeat) {  // cusparse
+      SWITCH_BITS(bits, DType, {
+        cusparse::CusparseCsrmm2<DType, IdType>(
+            ufeat->ctx, csr,
+            static_cast<DType*>(ufeat->data),
+            nullptr,
+            static_cast<DType*>(out->data),
+            x_length);
+      });
+    } else if (op == "mul" && is_scalar_efeat && cusparse_available<bits, IdType>()) {  // cusparse
       int64_t x_length = 1;
       for (int i = 1; i < ufeat->ndim; ++i)
         x_length *= ufeat->shape[i];
-      if (!IsNullArray(csr.data))
-        efeat = IndexSelect(efeat, csr.data);
-      cusparse::CusparseCsrmm2<DType>(
-          ufeat->ctx, csr,
-          static_cast<DType*>(ufeat->data),
-          static_cast<DType*>(efeat->data),
-          static_cast<DType*>(out->data),
-          x_length);
+      if (!IsNullArray(csr.data)) {
+        SWITCH_BITS(bits, DType, {
+          efeat = _IndexSelect<DType, IdType>(efeat, csr.data);
+        });
+      }
+      SWITCH_BITS(bits, DType, {
+        cusparse::CusparseCsrmm2<DType, IdType>(
+            ufeat->ctx, csr,
+            static_cast<DType*>(ufeat->data),
+            static_cast<DType*>(efeat->data),
+            static_cast<DType*>(out->data),
+            x_length);
+      });
     } else {  // general kernel
-      SWITCH_OP(op, Op, {
-        cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Sum<IdType, DType> >(
-            bcast, csr, ufeat, efeat, out, NullArray(), NullArray());
+      SWITCH_BITS(bits, DType, {
+        SWITCH_OP(op, Op, {
+          cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Sum<IdType, DType> >(
+              bcast, csr, ufeat, efeat, out, NullArray(), NullArray());
+        });
       });
     }
   } else if (reduce == "max") {
-    SWITCH_OP(op, Op, {
-      cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Max<IdType, DType> >(
-          bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
+    SWITCH_BITS(bits, DType, {
+      SWITCH_OP(op, Op, {
+        cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Max<IdType, DType> >(
+            bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
+      });
     });
   } else if (reduce == "min") {
-    SWITCH_OP(op, Op, {
-      cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Min<IdType, DType> >(
-          bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
+    SWITCH_BITS(bits, DType, {
+      SWITCH_OP(op, Op, {
+        cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Min<IdType, DType> >(
+            bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
+      });
     });
   } else {
     LOG(FATAL) << "Not implemented";
   }
 }

 /*!
  * \brief CUDA implementation of g-SpMM on Coo format.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SpMMCoo(const std::string& op, const std::string& reduce,
              const BcastOff& bcast,
              const COOMatrix& coo,
@@ -287,58 +413,81 @@ void SpMMCoo(const std::string& op, const std::string& reduce,
              NDArray out,
              std::vector<NDArray> out_aux) {
   if (reduce == "sum") {
-    SWITCH_OP(op, Op, {
-      cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Sum<IdType, DType, true> > (
-          bcast, coo, ufeat, efeat, out, NullArray(), NullArray());
+    SWITCH_BITS(bits, DType, {
+      SWITCH_OP(op, Op, {
+        cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Sum<IdType, DType, true> > (
+            bcast, coo, ufeat, efeat, out, NullArray(), NullArray());
+      });
     });
   } else if (reduce == "max") {
-    SWITCH_OP(op, Op, {
-      cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Max<IdType, DType, true> > (
-          bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
+    SWITCH_BITS(bits, DType, {
+      SWITCH_OP(op, Op, {
+        cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Max<IdType, DType, true> > (
+            bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
+      });
     });
   } else if (reduce == "min") {
-    SWITCH_OP(op, Op, {
-      cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Min<IdType, DType, true> > (
-          bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
+    SWITCH_BITS(bits, DType, {
+      SWITCH_OP(op, Op, {
+        cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Min<IdType, DType, true> > (
+            bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
+      });
     });
   } else {
     LOG(FATAL) << "Not implemented";
   }
 }

-template void SpMMCsr<kDLGPU, int32_t, float>(
+template void SpMMCsr<kDLGPU, int32_t, 16>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const CSRMatrix& csr,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int64_t, float>(
+template void SpMMCsr<kDLGPU, int64_t, 16>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const CSRMatrix& csr,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int32_t, double>(
+template void SpMMCsr<kDLGPU, int32_t, 32>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const CSRMatrix& csr,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCsr<kDLGPU, int64_t, double>(
+template void SpMMCsr<kDLGPU, int64_t, 32>(
+    const std::string& op, const std::string& reduce,
+    const BcastOff& bcast, const CSRMatrix& csr,
+    NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
+template void SpMMCsr<kDLGPU, int32_t, 64>(
+    const std::string& op, const std::string& reduce,
+    const BcastOff& bcast, const CSRMatrix& csr,
+    NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
+template void SpMMCsr<kDLGPU, int64_t, 64>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const CSRMatrix& csr,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int32_t, float>(
+template void SpMMCoo<kDLGPU, int32_t, 16>(
+    const std::string& op, const std::string& reduce,
+    const BcastOff& bcast, const COOMatrix& coo,
+    NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
+template void SpMMCoo<kDLGPU, int64_t, 16>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const COOMatrix& coo,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int64_t, float>(
+template void SpMMCoo<kDLGPU, int32_t, 32>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const COOMatrix& coo,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int32_t, double>(
+template void SpMMCoo<kDLGPU, int64_t, 32>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const COOMatrix& coo,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
-template void SpMMCoo<kDLGPU, int64_t, double>(
+template void SpMMCoo<kDLGPU, int32_t, 64>(
     const std::string& op, const std::string& reduce,
     const BcastOff& bcast, const COOMatrix& coo,
     NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
+template void SpMMCoo<kDLGPU, int64_t, 64>(
+    const std::string& op, const std::string& reduce,
+    const BcastOff& bcast, const COOMatrix& coo,
+    NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);

 }  // namespace aten
 }  // namespace dgl
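Note on the CUDA < 11 path above: cusparseXcsrmm2 leaves the result column-major, and the patch now transposes it with _Transpose; cuBLAS exposes Sgeam/Dgeam but no half-precision geam, hence the custom kernel specialization. A standalone sketch of that fallback, with the DGL runtime plumbing (CUDAThreadEntry, FindNumThreads, CUDA_KERNEL_CALL) replaced by plain CUDA and illustrative launch parameters:

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Naive transpose: block i writes output row i, threads stride over its columns.
// Mirrors the _TransposeKernel fallback used for half above.
template <typename DType>
__global__ void TransposeSketch(const DType* __restrict__ in,
                                DType* __restrict__ out, int n, int m) {
  int i = blockIdx.x;
  for (int j = threadIdx.x; j < m; j += blockDim.x)
    out[i * m + j] = in[j * n + i];
}

// Transpose a row-major (row x col) half matrix into a row-major (col x row) one.
void TransposeHalfSketch(const half* in, half* out, int row, int col,
                         cudaStream_t stream = 0) {
  const int nt = 256;  // stand-in for FindNumThreads(row)
  const int nb = col;  // one block per output row
  TransposeSketch<half><<<nb, nt, 0, stream>>>(in, out, col, row);
}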
@@ -8,6 +8,7 @@
 #include <dgl/bcast.h>
 #include "macro.cuh"
+#include "fp16.cuh"
 #include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
 #include "./utils.h"
@@ -147,7 +148,7 @@ __global__ void SpMMCsrKernel(
   while (ty < num_rows) {
     int tx = blockIdx.x * blockDim.x + threadIdx.x;
     while (tx < out_len) {
-      DType local_accum = ReduceOp::zero;
+      DType local_accum = ReduceOp::zero();
       Idx local_argu = 0, local_arge = 0;
       const int lhs_add = UseBcast ? ubcast_off[tx] : tx;
       const int rhs_add = UseBcast ? ebcast_off[tx] : tx;
@@ -191,6 +192,12 @@ void SpMMCoo(
     const COOMatrix& coo,
     NDArray ufeat, NDArray efeat,
     NDArray out, NDArray argu, NDArray arge) {
+#if defined(CUDART_VERSION) && CUDART_VERSION <= 10000
+  if (std::is_same<DType, half>::value)
+    LOG(FATAL) << "SpMMCoo requires atomicCAS, which is not supported "
+               << "for float16 in CUDA 10.0. Please upgrade your CUDA "
+               << "to later versions.";
+#endif
   const Idx *row = coo.row.Ptr<Idx>(),
             *col = coo.col.Ptr<Idx>(),
             *edge_map = coo.data.Ptr<Idx>();
@@ -211,7 +218,7 @@ void SpMMCoo(
   const int nt = FindNumThreads(out_size);
   const int nb = (out_size + nt - 1) / nt;
   CUDA_KERNEL_CALL(_FillKernel, nb, nt, 0, thr_entry->stream,
-      out_data, out_size, ReduceOp::zero);
+      out_data, out_size, ReduceOp::zero());
   const int ntx = FindNumThreads(len);
   const int nty = CUDA_MAX_NUM_THREADS / ntx;
...
@@ -18,6 +18,37 @@ namespace cuda {
 #define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF
 #define CUDA_MAX_NUM_THREADS 1024

+#ifdef USE_FP16
+#define SWITCH_BITS(bits, DType, ...)                               \
+  do {                                                              \
+    if ((bits) == 16) {                                             \
+      typedef half DType;                                           \
+      { __VA_ARGS__ }                                               \
+    } else if ((bits) == 32) {                                      \
+      typedef float DType;                                          \
+      { __VA_ARGS__ }                                               \
+    } else if ((bits) == 64) {                                      \
+      typedef double DType;                                         \
+      { __VA_ARGS__ }                                               \
+    } else {                                                        \
+      LOG(FATAL) << "Data type not renogized with bits " << bits;   \
+    }                                                               \
+  } while (0)
+#else  // USE_FP16
+#define SWITCH_BITS(bits, DType, ...)                               \
+  do {                                                              \
+    if ((bits) == 32) {                                             \
+      typedef float DType;                                          \
+      { __VA_ARGS__ }                                               \
+    } else if ((bits) == 64) {                                      \
+      typedef double DType;                                         \
+      { __VA_ARGS__ }                                               \
+    } else {                                                        \
+      LOG(FATAL) << "Data type not renogized with bits " << bits;   \
+    }                                                               \
+  } while (0)
+#endif  // USE_FP16
+
 /*! \brief Calculate the number of threads needed given the dimension length.
  *
  * It finds the biggest number that is smaller than min(dim, max_nthrs)
...
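A minimal sketch of how the SWITCH_BITS macro above is meant to be used by callers (the Fill kernel and dispatcher here are illustrative, not part of this patch, and assume the macro above and DGL's logging are in scope): the runtime bit width taken from an NDArray dtype is mapped to a concrete DType, and the body is compiled once per supported type.

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Illustrative kernel; any DType-templated kernel works the same way.
template <typename DType>
__global__ void Fill(DType* data, int n, DType val) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = val;
}

// bits is 16, 32 or 64 (e.g. feat->dtype.bits); with USE_FP16 the macro expands
// the body for half/float/double, otherwise only for float/double.
void FillDispatchSketch(void* data, int n, int bits) {
  const int nt = 256;
  const int nb = (n + nt - 1) / nt;
  SWITCH_BITS(bits, DType, {
    Fill<DType><<<nb, nt>>>(static_cast<DType*>(data), n, static_cast<DType>(1.0f));
  });
}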
@@ -81,13 +81,13 @@ void SpMM(const std::string& op, const std::string& reduce,
   ATEN_XPU_SWITCH_CUDA(graph->Context().device_type, XPU, "SpMM", {
     ATEN_ID_TYPE_SWITCH(graph->DataType(), IdType, {
-      ATEN_FLOAT_TYPE_SWITCH(out->dtype, DType, "Feature data", {
+      ATEN_FLOAT_BITS_SWITCH(out->dtype, bits, "Feature data", {
         if (format == SparseFormat::kCSC) {
-          SpMMCsr<XPU, IdType, DType>(
+          SpMMCsr<XPU, IdType, bits>(
               op, reduce, bcast, graph->GetCSCMatrix(0),
               ufeat, efeat, out, out_aux);
         } else if (format == SparseFormat::kCOO) {
-          SpMMCoo<XPU, IdType, DType>(
+          SpMMCoo<XPU, IdType, bits>(
               op, reduce, bcast, graph->GetCOOMatrix(0),
               ufeat, efeat, out, out_aux);
         } else {
@@ -112,13 +112,13 @@ void SDDMM(const std::string& op,
   ATEN_XPU_SWITCH_CUDA(graph->Context().device_type, XPU, "SDDMM", {
     ATEN_ID_TYPE_SWITCH(graph->DataType(), IdType, {
-      ATEN_FLOAT_TYPE_SWITCH(out->dtype, DType, "Feature data", {
+      ATEN_FLOAT_BITS_SWITCH(out->dtype, bits, "Feature data", {
         if (format == SparseFormat::kCSR) {
-          SDDMMCsr<XPU, IdType, DType>(
+          SDDMMCsr<XPU, IdType, bits>(
               op, bcast, graph->GetCSRMatrix(0),
               lhs, rhs, out, lhs_target, rhs_target);
         } else if (format == SparseFormat::kCOO) {
-          SDDMMCoo<XPU, IdType, DType>(
+          SDDMMCoo<XPU, IdType, bits>(
               op, bcast, graph->GetCOOMatrix(0),
               lhs, rhs, out, lhs_target, rhs_target);
         } else {
@@ -129,6 +129,15 @@ void SDDMM(const std::string& op,
   });
 }

+NDArray GetEdgeMapping(HeteroGraphRef graph) {
+  SparseFormat format = graph->SelectFormat(0, csc_code);
+  if (format == SparseFormat::kCSC) {
+    return graph.sptr()->GetCSCMatrix(0).data;
+  } else {
+    return NullArray();
+  }
+}
+
 /*! \brief Segment reduce dispatch function. */
 void SegmentReduceDispatch(const std::string& op,
                            NDArray feat,
@@ -137,8 +146,8 @@ void SegmentReduceDispatch(const std::string& op,
                            NDArray arg) {
   ATEN_XPU_SWITCH_CUDA(feat->ctx.device_type, XPU, "SegmentReduce", {
     ATEN_ID_TYPE_SWITCH(offsets->dtype, IdType, {
-      ATEN_FLOAT_TYPE_SWITCH(feat->dtype, DType, "Feature data", {
-        SegmentReduce<XPU, IdType, DType>(op, feat, offsets, out, arg);
+      ATEN_FLOAT_BITS_SWITCH(feat->dtype, bits, "Feature data", {
+        SegmentReduce<XPU, IdType, bits>(op, feat, offsets, out, arg);
       });
     });
   });
@@ -148,8 +157,8 @@ void SegmentReduceDispatch(const std::string& op,
 void BackwardSegmentCmpDispatch(NDArray feat, NDArray arg, NDArray out) {
   ATEN_XPU_SWITCH_CUDA(feat->ctx.device_type, XPU, "BackwardSegmentCmp", {
     ATEN_ID_TYPE_SWITCH(arg->dtype, IdType, {
-      ATEN_FLOAT_TYPE_SWITCH(feat->dtype, DType, "Feature data", {
-        BackwardSegmentCmp<XPU, IdType, DType>(feat, arg, out);
+      ATEN_FLOAT_BITS_SWITCH(feat->dtype, bits, "Feature data", {
+        BackwardSegmentCmp<XPU, IdType, bits>(feat, arg, out);
       });
     });
   });
@@ -226,6 +235,12 @@ DGL_REGISTER_GLOBAL("sparse._CAPI_DGLKernelBwdSegmentCmp")
     BackwardSegmentCmpDispatch(feat, arg, out);
   });

+DGL_REGISTER_GLOBAL("sparse._CAPI_DGLKernelGetEdgeMapping")
+.set_body([](DGLArgs args, DGLRetValue *rv) {
+    HeteroGraphRef graph = args[0];
+    *rv = GetEdgeMapping(graph);
+  });
+
 #ifdef USE_TVM
 DGL_REGISTER_GLOBAL("sparse._CAPI_FG_LoadModule")
 .set_body([] (DGLArgs args, DGLRetValue* rv) {
...
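ATEN_FLOAT_BITS_SWITCH itself is not part of this diff; presumably it checks that the NDArray dtype is a floating-point type and exposes its bit width as a compile-time constant, since callers pass it straight into a template argument (SpMMCsr<XPU, IdType, bits>). A purely hypothetical sketch of such a macro, not the actual DGL definition:

// Hypothetical stand-in; the real ATEN_FLOAT_BITS_SWITCH may differ.
#define FLOAT_BITS_SWITCH_SKETCH(dtype, bits, val_name, ...)             \
  do {                                                                   \
    CHECK_EQ((dtype).code, kDLFloat) << (val_name) << " must be float";  \
    if ((dtype).bits == 16) {                                            \
      constexpr int bits = 16;                                           \
      { __VA_ARGS__ }                                                    \
    } else if ((dtype).bits == 32) {                                     \
      constexpr int bits = 32;                                           \
      { __VA_ARGS__ }                                                    \
    } else if ((dtype).bits == 64) {                                     \
      constexpr int bits = 64;                                           \
      { __VA_ARGS__ }                                                    \
    } else {                                                             \
      LOG(FATAL) << (val_name) << " can only be float16/32/64";          \
    }                                                                    \
  } while (0)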
@@ -19,7 +19,7 @@ namespace aten {
 /*!
  * \brief Generalized Sparse Matrix Dense Matrix Multiplication on Csr format.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SpMMCsr(const std::string& op, const std::string& reduce,
              const BcastOff& bcast,
              const aten::CSRMatrix& csr,
@@ -31,7 +31,7 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
 /*!
  * \brief Generalized Sparse Matrix Dense Matrix Multiplication on Coo format.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SpMMCoo(const std::string& op, const std::string& reduce,
              const BcastOff& bcast,
              const aten::COOMatrix& coo,
@@ -43,7 +43,7 @@ void SpMMCoo(const std::string& op, const std::string& reduce,
 /*!
  * \brief Generalized Sampled Dense-Dense Matrix Multiplication on Csr format.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SDDMMCsr(const std::string& op,
               const BcastOff& bcast,
               const aten::CSRMatrix& csr,
@@ -56,7 +56,7 @@ void SDDMMCsr(const std::string& op,
 /*!
  * \brief Generalized Sampled Dense-Dense Matrix Multiplication on Coo format.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SDDMMCoo(const std::string& op,
               const BcastOff& bcast,
               const aten::COOMatrix& coo,
@@ -69,7 +69,7 @@ void SDDMMCoo(const std::string& op,
 /*!
  * \brief Segment reduce.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void SegmentReduce(const std::string& op,
                    NDArray feat,
                    NDArray offsets,
@@ -79,7 +79,7 @@ void SegmentReduce(const std::string& op,
 /*!
  * \brief Backward function of segment cmp.
  */
-template <int XPU, typename IdType, typename DType>
+template <int XPU, typename IdType, int bits>
 void BackwardSegmentCmp(NDArray feat,
                         NDArray arg,
                         NDArray out);
...
@@ -70,6 +70,46 @@ inline bool is_zero<dim3>(dim3 size) {
     CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e;  \
   }

+/*
+ * \brief Cast data type to cudaDataType_t.
+ */
+template <typename T>
+struct cuda_dtype {
+  static constexpr cudaDataType_t value = CUDA_R_32F;
+};
+
+template <>
+struct cuda_dtype<half> {
+  static constexpr cudaDataType_t value = CUDA_R_16F;
+};
+
+template <>
+struct cuda_dtype<float> {
+  static constexpr cudaDataType_t value = CUDA_R_32F;
+};
+
+template <>
+struct cuda_dtype<double> {
+  static constexpr cudaDataType_t value = CUDA_R_64F;
+};
+
+/*
+ * \brief Cast index data type to cusparseIndexType_t.
+ */
+template <typename T>
+struct cusparse_idtype {
+  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+};
+
+template <>
+struct cusparse_idtype<int32_t> {
+  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+};
+
+template <>
+struct cusparse_idtype<int64_t> {
+  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
+};
+
 /*! \brief Thread local workspace */
 class CUDAThreadEntry {
...
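The two trait structs above replace the old float-or-double ternary in CusparseCsrmm2: the value data type and the index type are both resolved at compile time. A few static_asserts illustrating the intended mappings (assuming <cuda_fp16.h>, <library_types.h> and <cusparse.h> are in scope, as they are via this header):

static_assert(cuda_dtype<half>::value   == CUDA_R_16F, "fp16 maps to CUDA_R_16F");
static_assert(cuda_dtype<float>::value  == CUDA_R_32F, "fp32 maps to CUDA_R_32F");
static_assert(cuda_dtype<double>::value == CUDA_R_64F, "fp64 maps to CUDA_R_64F");
static_assert(cusparse_idtype<int32_t>::value == CUSPARSE_INDEX_32I, "int32 indices");
static_assert(cusparse_idtype<int64_t>::value == CUSPARSE_INDEX_64I, "int64 indices");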