Unverified Commit 7bab1365 authored by Zihao Ye and committed by GitHub

[feature] Supporting half precision floating data type (fp16). (#2552)



* add tvm as submodule

* compilation is ok but calling fails

* can call now

* pack multiple modules, change names

* upd

* upd

* upd

* fix cmake

* upd

* upd

* upd

* upd

* fix

* relative path

* upd

* upd

* upd

* singleton

* upd

* trigger

* fix

* upd

* count reducible

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* upd

* only keep related files

* upd

* upd

* upd

* upd

* lint

* lint

* lint

* lint

* pylint

* upd

* upd

* compilation

* fix

* upd

* upd

* upd

* upd

* upd

* upd

* upd doc

* refactor

* fix

* upd number
Co-authored-by: Zhi Lin <linzhilynn@gmail.com>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-42-78.us-east-2.compute.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-21-156.us-east-2.compute.internal>
Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
parent a7e941c3
......@@ -25,6 +25,7 @@ endif()
dgl_option(USE_CUDA "Build with CUDA" OFF)
dgl_option(USE_OPENMP "Build with OpenMP" ON)
dgl_option(USE_AVX "Build with AVX optimization" ON)
dgl_option(USE_FP16 "Build with fp16 support to enable mixed precision training" OFF)
dgl_option(USE_TVM "Build with TVM kernels" OFF)
dgl_option(BUILD_CPP_TEST "Build cpp unittest executables" OFF)
dgl_option(LIBCXX_ENABLE_PARALLEL_ALGORITHMS "Enable the parallel algorithms library. This requires the PSTL to be available." OFF)
......@@ -101,13 +102,22 @@ if(USE_OPENMP)
set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}")
endif(OPENMP_FOUND)
message(STATUS "Build with OpenMP.")
endif(USE_OPENMP)
if(USE_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_AVX")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX")
message(STATUS "Build with AVX optimization.")
endif(USE_AVX)
# Build with fp16 to support mixed precision training.
if(USE_FP16)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_FP16")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_FP16")
message(STATUS "Build with fp16 to support mixed precision training")
endif(USE_FP16)
# To compile METIS correct for DGL.
if(MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32")
......@@ -194,6 +204,7 @@ if(USE_TVM)
target_include_directories(dgl PRIVATE "featgraph/include")
add_subdirectory("featgraph/")
list(APPEND DGL_LINKER_LIBS featgraph_runtime)
message(STATUS "Build with TVM runtime and featgraph kernels.")
endif(USE_TVM)
# support PARALLEL_ALGORITHMS
......
......@@ -49,3 +49,5 @@ set(BUILD_TORCH ON)
# Whether to enable CUDA kernels compiled with TVM.
set(USE_TVM OFF)
# Whether to enable fp16 to support mixed precision training.
set(USE_FP16 OFF)
......@@ -136,7 +136,7 @@ function(dgl_select_nvcc_arch_flags out_variable)
string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}")
mshadow_list_unique(__cuda_arch_bin __cuda_arch_ptx)
set(__nvcc_flags "")
set(__nvcc_flags "--expt-relaxed-constexpr")
set(__nvcc_archs_readable "")
# Tell NVCC to add binaries for the specified GPUs
......
......@@ -2,4 +2,5 @@ build
# tutorials are auto-generated
source/tutorials
source/new-tutorial
source/generated
......@@ -246,6 +246,21 @@ DGL provide operators to reduce value tensor along the first dimension by segmen
segment_reduce
Supported Data types
--------------------
Operators defined in ``dgl.ops`` support floating-point data types only, i.e. the operands
must be ``half`` (``float16``), ``float`` (``float32``), or ``double`` (``float64``) tensors.
All input tensors must share the same data type: if one input tensor is ``float16`` and
another is ``float32``, users must cast one of them so the types match, as shown in the
example below.

``float16`` support is disabled by default because it requires a GPU with compute
capability ``sm_53`` or higher (e.g. the Pascal, Volta, Turing, and Ampere architectures).

Users can enable ``float16`` for mixed precision training by compiling DGL from source
(see the :doc:`Mixed Precision Training </guide/mixed_precision>` guide for details).
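The following is a minimal sketch of aligning operand data types before calling an
operator, assuming a CUDA build with ``USE_FP16=ON``; ``u_mul_e_sum`` is used here
purely for illustration:

>>> import torch
>>> import dgl
>>> import dgl.ops as ops
>>> g = dgl.rand_graph(30, 100).to(0)             # graph on GPU
>>> h = torch.rand(30, 16, device='cuda').half()  # float16 node features
>>> w = torch.rand(100, 16, device='cuda')        # float32 edge features
>>> out = ops.u_mul_e_sum(g, h, w.half())         # cast w so both operands are float16
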
Relation with Message Passing APIs
----------------------------------
......
......@@ -97,3 +97,12 @@ features ``ft``, and finally multiply ``ft`` by 2 to get the result
The math formula for the above function is:
.. math:: {final\_ft}_i = 2 * \sum_{j\in\mathcal{N}(i)} ({ft}_j * a_{ij})
DGL's built-in functions support floating-point data types only, i.e. the features must
be ``half`` (``float16``), ``float`` (``float32``), or ``double`` (``float64``) tensors.

``float16`` support is disabled by default because it requires a GPU with compute
capability ``sm_53`` or higher (e.g. the Pascal, Volta, Turing, and Ampere architectures).

Users can enable ``float16`` for mixed precision training by compiling DGL from source
(see the :doc:`Mixed Precision Training <mixed_precision>` guide for details).
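For example, the following minimal sketch (assuming a CUDA build with ``USE_FP16=ON``)
runs a built-in ``copy_u``/``max`` message-passing step on half-precision features:

>>> import torch
>>> import dgl
>>> import dgl.function as fn
>>> g = dgl.rand_graph(30, 100).to(0)                        # graph on GPU
>>> g.ndata['h'] = torch.rand(30, 16, device='cuda').half()  # float16 node features
>>> g.update_all(fn.copy_u('h', 'm'), fn.max('m', 'hmax'))
>>> g.ndata['hmax'].dtype
torch.float16
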
\ No newline at end of file
.. _guide-mixed_precision:
Chapter 8: Mixed Precision Training
===================================
DGL is compatible with `PyTorch's automatic mixed precision package
<https://pytorch.org/docs/stable/amp.html>`_
for mixed precision training, which saves both training time and GPU memory
consumption. To enable this feature, users need to install PyTorch 1.6+ and
build DGL from source with ``float16`` support (this feature is
still in its beta stage and we do not provide official pre-built pip wheels).
Installation
------------
First, download DGL's source code from GitHub and build the shared library
with the flag ``USE_FP16=ON``.
.. code:: bash
git clone --recurse-submodules https://github.com/dmlc/dgl.git
cd dgl
mkdir build
cd build
cmake -DUSE_CUDA=ON -DUSE_FP16=ON ..
make -j
Then install the Python binding.
.. code:: bash
cd ../python
python setup.py install
Message-Passing with Half Precision
-----------------------------------
DGL with fp16 support allows message passing on ``float16`` features with both
user-defined functions (UDFs) and built-in functions (e.g. ``dgl.function.sum``,
``dgl.function.copy_u``).
The following example shows how to use DGL's message-passing API on half-precision
features:
>>> import torch
>>> import dgl
>>> import dgl.function as fn
>>> g = dgl.rand_graph(30, 100).to(0) # Create a graph on GPU w/ 30 nodes and 100 edges.
>>> g.ndata['h'] = torch.rand(30, 16).to(0).half() # Create fp16 node features.
>>> g.edata['w'] = torch.rand(100, 1).to(0).half() # Create fp16 edge features.
>>> # Use DGL's built-in functions for message passing on fp16 features.
>>> g.update_all(fn.u_mul_e('h', 'w', 'm'), fn.sum('m', 'x'))
>>> g.ndata['x'][0]
tensor([0.3391, 0.2208, 0.7163, 0.6655, 0.7031, 0.5854, 0.9404, 0.7720, 0.6562,
0.4028, 0.6943, 0.5908, 0.9307, 0.5962, 0.7827, 0.5034],
device='cuda:0', dtype=torch.float16)
>>> g.apply_edges(fn.u_dot_v('h', 'x', 'hx'))
>>> g.edata['hx'][0]
tensor([5.4570], device='cuda:0', dtype=torch.float16)
>>> # Use UDF(User Defined Functions) for message passing on fp16 features.
>>> def message(edges):
... return {'m': edges.src['h'] * edges.data['w']}
...
>>> def reduce(nodes):
... return {'y': torch.sum(nodes.mailbox['m'], 1)}
...
>>> def dot(edges):
... return {'hy': (edges.src['h'] * edges.dst['y']).sum(-1, keepdim=True)}
...
>>> g.update_all(message, reduce)
>>> g.ndata['y'][0]
tensor([0.3394, 0.2209, 0.7168, 0.6655, 0.7026, 0.5854, 0.9404, 0.7720, 0.6562,
0.4028, 0.6943, 0.5908, 0.9307, 0.5967, 0.7827, 0.5039],
device='cuda:0', dtype=torch.float16)
>>> g.apply_edges(dot)
>>> g.edata['hy'][0]
tensor([5.4609], device='cuda:0', dtype=torch.float16)
End-to-End Mixed Precision Training
-----------------------------------
DGL relies on PyTorch's AMP package for mixed precision training,
and the user experience is exactly
the same as `PyTorch's <https://pytorch.org/docs/stable/notes/amp_examples.html>`_.
By wrapping the forward pass (including the loss computation) of your GNN model with
``torch.cuda.amp.autocast()``, PyTorch automatically selects the appropriate data type
for each op and tensor. Half-precision tensors are memory efficient, and most operations
on them are faster because they leverage the GPU's Tensor Cores.

Small gradients in ``float16`` suffer from underflow (they flush to zero), and
PyTorch provides a ``GradScaler`` module to address this issue: it multiplies the
loss by a scale factor, invokes the backward pass on the scaled loss, and unscales the
gradients before the optimizer updates the parameters, thereby preventing underflow.
The scale factor is adjusted automatically.
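As a quick illustration of why scaling matters (a standalone sketch, not specific to DGL):
a value too small for ``float16`` silently becomes zero, while the same value multiplied
by a scale factor survives the cast.

.. code::

    import torch

    tiny = torch.tensor(1e-8)    # representable in float32
    print(tiny.half())           # tensor(0., dtype=torch.float16) -- underflow
    print((tiny * 1024).half())  # non-zero: the scaled value fits in float16
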
The following is a training script for a 3-layer GAT on the Reddit dataset (with 114
million edges); note how the code differs depending on whether ``use_fp16`` is enabled:
.. code::
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
import dgl
from dgl.data import RedditDataset
from dgl.nn import GATConv
use_fp16 = True
class GAT(nn.Module):
def __init__(self,
in_feats,
n_hidden,
n_classes,
heads):
super().__init__()
self.layers = nn.ModuleList()
self.layers.append(GATConv(in_feats, n_hidden, heads[0], activation=F.elu))
self.layers.append(GATConv(n_hidden * heads[0], n_hidden, heads[1], activation=F.elu))
self.layers.append(GATConv(n_hidden * heads[1], n_classes, heads[2], activation=F.elu))
def forward(self, g, h):
for l, layer in enumerate(self.layers):
h = layer(g, h)
if l != len(self.layers) - 1:
h = h.flatten(1)
else:
h = h.mean(1)
return h
# Data loading
data = RedditDataset()
device = torch.device(0)
g = data[0]
g = dgl.add_self_loop(g)
g = g.int().to(device)
train_mask = g.ndata['train_mask']
features = g.ndata['feat']
labels = g.ndata['label']
in_feats = features.shape[1]
n_hidden = 256
n_classes = data.num_classes
n_edges = g.number_of_edges()
heads = [1, 1, 1]
model = GAT(in_feats, n_hidden, n_classes, heads)
model = model.to(device)
# Create optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
# Create gradient scaler
scaler = GradScaler()
for epoch in range(100):
model.train()
optimizer.zero_grad()
# Wrap forward pass with autocast
with autocast(enabled=use_fp16):
logits = model(g, features)
loss = F.cross_entropy(logits[train_mask], labels[train_mask])
if use_fp16:
# Backprop w/ gradient scaling
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
else:
loss.backward()
optimizer.step()
print('Epoch {} | Loss {}'.format(epoch, loss.item()))
On an NVIDIA V100 (16GB) machine, training this model without fp16 consumes
15.2GB of GPU memory; with fp16 turned on, training consumes 12.8GB of
GPU memory, and the loss converges to similar values in both settings.
If we change the number of heads to ``[2, 2, 2]``, training without fp16
triggers a GPU out-of-memory (OOM) error, while training with fp16 consumes
15.7GB of GPU memory.
DGL is still improving its half-precision support and the compute kernels'
performance is far from optimal, so please stay tuned for future updates.
......@@ -97,6 +97,7 @@ Getting Started
guide/training
guide/minibatch
guide/distributed
guide/mixed_precision
.. toctree::
:maxdepth: 2
......
......@@ -124,6 +124,23 @@
} \
} while (0)
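/*
 * Dispatch according to floating-point bit-width (16, 32 or 64); the matched
 * width is exposed to the macro body as the compile-time constant `bits`.
 */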
#define ATEN_FLOAT_BITS_SWITCH(val, bits, val_name, ...) do { \
CHECK_EQ((val).code, kDLFloat) \
<< (val_name) << " must be float type"; \
if ((val).bits == 16) { \
constexpr int bits = 16; \
{__VA_ARGS__} \
} else if ((val).bits == 32) { \
constexpr int bits = 32; \
{__VA_ARGS__} \
} else if ((val).bits == 64) { \
constexpr int bits = 64; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << (val_name) << " can only be float16, float32 or float64"; \
} \
} while (0)
/*
* Dispatch according to data type (int32, int64, float32 or float64):
*
......
import torch as th
from distutils.version import LooseVersion
from ...base import is_all, ALL
from ...sparse import _gspmm, _gsddmm, _segment_reduce, _bwd_segment_cmp, _reverse
if LooseVersion(th.__version__) >= LooseVersion("1.6.0"):
from torch.cuda.amp import custom_fwd, custom_bwd
else:
import functools
"""PyTorch natively supports automatic mixed precision in DGL 1.6, we redefine
the custom_fwd and custom_bwd function to be compatible with DGL 1.5.
"""
def custom_fwd(**kwargs):
def custom_fwd_inner(fwd):
@functools.wraps(fwd)
def decorate_fwd(*args, **kwargs):
return fwd(*args, **kwargs)
return decorate_fwd
return custom_fwd_inner
def custom_bwd(bwd):
@functools.wraps(bwd)
def decorate_bwd(*args, **kwargs):
return bwd(*args, **kwargs)
return decorate_bwd
__all__ = ['gspmm', 'gsddmm', 'edge_softmax', 'segment_reduce']
......@@ -60,6 +82,7 @@ def _expand(x, shape):
class GSpMM(th.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=th.float16)
def forward(ctx, gidx, op, reduce_op, X, Y):
out, (argX, argY) = _gspmm(gidx, op, reduce_op, X, Y)
ctx.backward_cache = gidx, op, reduce_op
......@@ -67,6 +90,7 @@ class GSpMM(th.autograd.Function):
return out
@staticmethod
@custom_bwd
def backward(ctx, dZ):
gidx, op, reduce_op = ctx.backward_cache
X, Y, argX, argY = ctx.saved_tensors
......@@ -120,6 +144,7 @@ class GSpMM(th.autograd.Function):
class GSDDMM(th.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=th.float16)
def forward(ctx, gidx, op, X, Y, lhs_target, rhs_target):
out = _gsddmm(gidx, op, X, Y, lhs_target, rhs_target)
ctx.backward_cache = gidx, op, lhs_target, rhs_target
......@@ -127,6 +152,7 @@ class GSDDMM(th.autograd.Function):
return out
@staticmethod
@custom_bwd
def backward(ctx, dZ):
gidx, op, lhs_target, rhs_target = ctx.backward_cache
X, Y = ctx.saved_tensors
......@@ -179,6 +205,7 @@ class GSDDMM(th.autograd.Function):
class EdgeSoftmax(th.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=th.float16)
def forward(ctx, gidx, score, eids, norm_by):
"""Forward function.
......@@ -208,6 +235,7 @@ class EdgeSoftmax(th.autograd.Function):
return out
@staticmethod
@custom_bwd
def backward(ctx, grad_out):
"""Backward function.
......@@ -233,6 +261,7 @@ class EdgeSoftmax(th.autograd.Function):
class SegmentReduce(th.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=th.float16)
def forward(ctx, op, x, offsets):
y, arg = _segment_reduce(op, x, offsets)
ctx.save_for_backward(arg, offsets)
......@@ -240,6 +269,7 @@ class SegmentReduce(th.autograd.Function):
return y
@staticmethod
@custom_bwd
def backward(ctx, dy):
op = ctx.backward_cache
arg, offsets = ctx.saved_tensors
......
......@@ -159,6 +159,7 @@ def _gspmm(gidx, op, reduce_op, u, e):
if F.ndim(e) == 1:
e = F.unsqueeze(e, -1)
expand_e = True
ctx = F.context(u) if use_u else F.context(e)
dtype = F.dtype(u) if use_u else F.dtype(e)
u_shp = F.shape(u) if use_u else (0,)
......
......@@ -41,9 +41,22 @@ namespace aten {
} \
} while (0)
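/*
 * Map a floating-point bit-width to a concrete C++ type. On CPU, 16-bit inputs
 * fall back to single-precision (float) computation.
 */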
#define SWITCH_BITS(bits, DType, ...) \
do { \
if ((bits) == 16 || (bits) == 32) { \
typedef float DType; \
{ __VA_ARGS__ } \
} else if ((bits) == 64) { \
typedef double DType; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Data type not renogized with bits " << bits; \
} \
} while (0)
/*! \brief Generalized SDDMM on Csr format. */
template <int XPU, typename IdType, typename DType>
template <int XPU, typename IdType, int bits>
void SDDMMCsr(const std::string& op,
const BcastOff& bcast,
const CSRMatrix& csr,
......@@ -52,32 +65,43 @@ void SDDMMCsr(const std::string& op,
NDArray out,
int lhs_target,
int rhs_target) {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, {
cpu::SDDMMCsr<IdType, DType, Op, LhsTarget, RhsTarget>(bcast, csr, lhs, rhs, out);
});
});
});
}
template void SDDMMCsr<kDLCPU, int32_t, float>(
template void SDDMMCsr<kDLCPU, int32_t, 16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCsr<kDLCPU, int64_t, 16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCsr<kDLCPU, int64_t, float>(
template void SDDMMCsr<kDLCPU, int32_t, 32>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCsr<kDLCPU, int32_t, double>(
template void SDDMMCsr<kDLCPU, int64_t, 32>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCsr<kDLCPU, int64_t, double>(
template void SDDMMCsr<kDLCPU, int32_t, 64>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCsr<kDLCPU, int64_t, 64>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
/*! \brief Generalized SDDMM on Coo format. */
template <int XPU, typename IdType, typename DType>
template <int XPU, typename IdType, int bits>
void SDDMMCoo(const std::string& op,
const BcastOff& bcast,
const COOMatrix& coo,
......@@ -86,29 +110,40 @@ void SDDMMCoo(const std::string& op,
NDArray out,
int lhs_target,
int rhs_target) {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, {
cpu::SDDMMCoo<IdType, DType, Op, LhsTarget, RhsTarget>(bcast, coo, lhs, rhs, out);
});
});
});
}
template void SDDMMCoo<kDLCPU, int32_t, float>(
template void SDDMMCoo<kDLCPU, int32_t, 16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLCPU, int64_t, float>(
template void SDDMMCoo<kDLCPU, int64_t, 16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLCPU, int32_t, double>(
template void SDDMMCoo<kDLCPU, int32_t, 32>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLCPU, int64_t, double>(
template void SDDMMCoo<kDLCPU, int64_t, 32>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLCPU, int32_t, 64>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLCPU, int64_t, 64>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
} // namespace aten
} // namespace dgl
......@@ -12,7 +12,7 @@ namespace dgl {
namespace aten {
/*! \brief Segment Reduce operator. */
template <int XPU, typename IdType, typename DType>
template <int XPU, typename IdType, int bits>
void SegmentReduce(
const std::string& op,
NDArray feat,
......@@ -20,65 +20,94 @@ void SegmentReduce(
NDArray out,
NDArray arg) {
if (op == "sum") {
SWITCH_BITS(bits, DType, {
cpu::SegmentSum<IdType, DType>(feat, offsets, out);
});
} else if (op == "max" || op == "min") {
if (op == "max")
if (op == "max") {
SWITCH_BITS(bits, DType, {
cpu::SegmentCmp<IdType, DType, cpu::op::Max<DType>>(
feat, offsets, out, arg);
else
});
} else {
SWITCH_BITS(bits, DType, {
cpu::SegmentCmp<IdType, DType, cpu::op::Min<DType>>(
feat, offsets, out, arg);
});
}
} else {
LOG(FATAL) << "Unsupported reduce function " << op;
}
}
/*! \brief Backward function of segment cmp.*/
template <int XPU, typename IdType, typename DType>
template <int XPU, typename IdType, int bits>
void BackwardSegmentCmp(
NDArray feat,
NDArray arg,
NDArray out) {
SWITCH_BITS(bits, DType, {
cpu::BackwardSegmentCmp<IdType, DType>(feat, arg, out);
});
}
template void SegmentReduce<kDLCPU, int32_t, float>(
template void SegmentReduce<kDLCPU, int32_t, 16>(
const std::string &op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
template void SegmentReduce<kDLCPU, int64_t, 16>(
const std::string &op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
template void SegmentReduce<kDLCPU, int32_t, 32>(
const std::string &op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
template void SegmentReduce<kDLCPU, int64_t, float>(
template void SegmentReduce<kDLCPU, int64_t, 32>(
const std::string &op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
template void SegmentReduce<kDLCPU, int32_t, double>(
template void SegmentReduce<kDLCPU, int32_t, 64>(
const std::string &op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
template void SegmentReduce<kDLCPU, int64_t, double>(
template void SegmentReduce<kDLCPU, int64_t, 64>(
const std::string &op,
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg);
template void BackwardSegmentCmp<kDLCPU, int32_t, float>(
template void BackwardSegmentCmp<kDLCPU, int32_t, 16>(
NDArray feat,
NDArray arg,
NDArray out);
template void BackwardSegmentCmp<kDLCPU, int64_t, 16>(
NDArray feat,
NDArray arg,
NDArray out);
template void BackwardSegmentCmp<kDLCPU, int32_t, 32>(
NDArray feat,
NDArray arg,
NDArray out);
template void BackwardSegmentCmp<kDLCPU, int64_t, float>(
template void BackwardSegmentCmp<kDLCPU, int64_t, 32>(
NDArray feat,
NDArray arg,
NDArray out);
template void BackwardSegmentCmp<kDLCPU, int32_t, double>(
template void BackwardSegmentCmp<kDLCPU, int32_t, 64>(
NDArray feat,
NDArray arg,
NDArray out);
template void BackwardSegmentCmp<kDLCPU, int64_t, double>(
template void BackwardSegmentCmp<kDLCPU, int64_t, 64>(
NDArray feat,
NDArray arg,
NDArray out);
......
......@@ -10,7 +10,7 @@ namespace dgl {
namespace aten {
/*! \brief Generalized SpMM on Csr format. */
template <int XPU, typename IdType, typename DType>
template <int XPU, typename IdType, int bits>
void SpMMCsr(const std::string& op, const std::string& reduce,
const BcastOff& bcast,
const CSRMatrix& csr,
......@@ -19,10 +19,13 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
NDArray out,
std::vector<NDArray> out_aux) {
if (reduce == "sum") {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
cpu::SpMMSumCsr<IdType, DType, Op>(bcast, csr, ufeat, efeat, out);
});
});
} else if (reduce == "max" || reduce == "min") {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
if (reduce == "max")
cpu::SpMMCmpCsr<IdType, DType, Op, cpu::op::Max<DType>>(
......@@ -31,30 +34,40 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
cpu::SpMMCmpCsr<IdType, DType, Op, cpu::op::Min<DType>>(
bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
});
});
} else {
LOG(FATAL) << "Unsupported SpMM reducer: " << reduce;
}
}
template void SpMMCsr<kDLCPU, int32_t, float>(
template void SpMMCsr<kDLCPU, int32_t, 16>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
template void SpMMCsr<kDLCPU, int64_t, 16>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
template void SpMMCsr<kDLCPU, int64_t, float>(
template void SpMMCsr<kDLCPU, int32_t, 32>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
template void SpMMCsr<kDLCPU, int32_t, double>(
template void SpMMCsr<kDLCPU, int64_t, 32>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
template void SpMMCsr<kDLCPU, int64_t, double>(
template void SpMMCsr<kDLCPU, int32_t, 64>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
template void SpMMCsr<kDLCPU, int64_t, 64>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
/*! \brief Generalized SpMM on Coo format. */
template <int XPU, typename IdType, typename DType>
template <int XPU, typename IdType, int bits>
void SpMMCoo(const std::string& op, const std::string& reduce,
const BcastOff& bcast,
const COOMatrix& coo,
......@@ -63,10 +76,13 @@ void SpMMCoo(const std::string& op, const std::string& reduce,
NDArray out,
std::vector<NDArray> out_aux) {
if (reduce == "sum") {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
cpu::SpMMSumCoo<IdType, DType, Op>(bcast, coo, ufeat, efeat, out);
});
});
} else if (reduce == "max" || reduce == "min") {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
if (reduce == "max")
cpu::SpMMCmpCoo<IdType, DType, Op, cpu::op::Max<DType>>(
......@@ -75,27 +91,37 @@ void SpMMCoo(const std::string& op, const std::string& reduce,
cpu::SpMMCmpCoo<IdType, DType, Op, cpu::op::Min<DType>>(
bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
});
});
} else {
LOG(FATAL) << "Unsupported SpMM reducer: " << reduce;
}
}
template void SpMMCoo<kDLCPU, int32_t, float>(
template void SpMMCoo<kDLCPU, int32_t, 16>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
template void SpMMCoo<kDLCPU, int64_t, 16>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
template void SpMMCoo<kDLCPU, int64_t, float>(
template void SpMMCoo<kDLCPU, int32_t, 32>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
template void SpMMCoo<kDLCPU, int32_t, double>(
template void SpMMCoo<kDLCPU, int64_t, 32>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
template void SpMMCoo<kDLCPU, int64_t, double>(
template void SpMMCoo<kDLCPU, int32_t, 64>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
template void SpMMCoo<kDLCPU, int64_t, 64>(
const std::string& op, const std::string& reduce,
const BcastOff& bcast, const COOMatrix& coo,
NDArray ufeat, NDArray efeat, NDArray out, std::vector<NDArray> out_aux);
} // namespace aten
} // namespace dgl
......@@ -146,6 +146,19 @@ constexpr DType Min<DType>::zero;
} \
} while (0)
#define SWITCH_BITS(bits, DType, ...) \
do { \
if ((bits) == 16 || (bits) == 32) { \
typedef float DType; \
{ __VA_ARGS__ } \
} else if ((bits) == 64) { \
typedef double DType; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Data type not renogized with bits " << bits; \
} \
} while (0)
} // namespace op
} // namespace cpu
......
......@@ -7,9 +7,9 @@
#define DGL_ARRAY_CUDA_ATOMIC_H_
#include <cuda_runtime.h>
#if __CUDA_ARCH__ >= 600
#include <cuda_fp16.h>
#endif
#include <cassert>
#include "fp16.cuh"
namespace dgl {
namespace aten {
......@@ -18,6 +18,10 @@ namespace cuda {
// Type trait for selecting code type
template <int Bytes> struct Code { };
template <> struct Code<2> {
typedef unsigned short int Type;
};
template <> struct Code<4> {
typedef unsigned int Type;
};
......@@ -37,6 +41,18 @@ template <typename T> struct Cast {
}
};
#ifdef USE_FP16
template <> struct Cast<half> {
typedef Code<sizeof(half)>::Type Type;
static __device__ __forceinline__ Type Encode(half val) {
return __half_as_ushort(val);
}
static __device__ __forceinline__ half Decode(Type code) {
return __ushort_as_half(code);
}
};
#endif
template <> struct Cast<float> {
typedef Code<sizeof(float)>::Type Type;
static __device__ __forceinline__ Type Encode(float val) {
......@@ -57,6 +73,18 @@ template <> struct Cast<double> {
}
};
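// 16-bit compare-and-swap: the hardware instruction is only used on recent CUDA
// toolkits and devices of compute capability >= 7.0; otherwise this falls back to
// returning `val` without performing the exchange.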
static __device__ __forceinline__ unsigned short int atomicCASshort(
unsigned short int *address,
unsigned short int compare,
unsigned short int val) {
#if (defined(CUDART_VERSION) && (CUDART_VERSION > 10000))
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700)
return atomicCAS(address, compare, val);
#endif // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700)
#endif // (defined(CUDART_VERSION) && (CUDART_VERSION > 10000))
return val;
}
#define DEFINE_ATOMIC(NAME) \
template <typename T> \
__device__ __forceinline__ T Atomic##NAME(T* addr, T val) { \
......@@ -72,51 +100,70 @@ template <> struct Cast<double> {
return Cast<T>::Decode(old); \
}
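// Half-precision specialization: emulates the atomic with a 16-bit
// compare-and-swap loop built on atomicCASshort.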
#define DEFINE_ATOMIC_HALF(NAME) \
template <> \
__device__ __forceinline__ half Atomic##NAME<half>(half* addr, half val) { \
typedef unsigned short int CT; \
CT* addr_as_ui = reinterpret_cast<CT*>(addr); \
CT old = *addr_as_ui; \
CT assumed = old; \
do { \
assumed = old; \
old = atomicCASshort(addr_as_ui, assumed, \
Cast<half>::Encode(OP(val, Cast<half>::Decode(old)))); \
} while (assumed != old); \
return Cast<half>::Decode(old); \
}
#define OP(a, b) max(a, b)
DEFINE_ATOMIC(Max)
#ifdef USE_FP16
DEFINE_ATOMIC_HALF(Max)
#endif // USE_FP16
#undef OP
#define OP(a, b) min(a, b)
DEFINE_ATOMIC(Min)
#ifdef USE_FP16
DEFINE_ATOMIC_HALF(Min)
#endif // USE_FP16
#undef OP
#define OP(a, b) a + b
DEFINE_ATOMIC(Add)
#undef OP
#if __CUDA_ARCH__ >= 200
template <>
__device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
#if __CUDA_ARCH__ >= 200
return atomicAdd(addr, val);
}
#else
return *addr + val;
#endif // __CUDA_ARCH__
}
#if __CUDA_ARCH__ >= 600
template <>
__device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
#if __CUDA_ARCH__ >= 600
return atomicAdd(addr, val);
}
#else
return *addr + val;
#endif
}
#ifdef USE_FP16
#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000
#if __CUDA_ARCH__ >= 600
template <>
__device__ __forceinline__ __half2 AtomicAdd<__half2>(__half2* addr, __half2 val) {
return atomicAdd(addr, val);
}
#endif // __CUDA_ARCH__
__device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
#if __CUDA_ARCH__ >= 700
template <>
__device__ __forceinline__ __half AtomicAdd<__half>(__half* addr, __half val) {
return atomicAdd(addr, val);
}
#else
return *addr + val;
#endif // __CUDA_ARCH__
#endif
}
#endif // defined(CUDART_VERSION) && CUDART_VERSION >= 10000
#endif // USE_FP16
#define OP(a, b) a * b
DEFINE_ATOMIC(Mul)
#undef OP
} // namespace cuda
} // namespace aten
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/fp16.cuh
* \brief float16 related functions.
* \note this file is modified from TVM project:
* https://github.com/apache/tvm/blob/e561007f0c330e3d14c2bc8a3ef40fb741db9004/src/target/source/literal/cuda_half_t.h.
*/
#ifndef DGL_ARRAY_FP16_CUH_
#define DGL_ARRAY_FP16_CUH_
#ifdef USE_FP16
#include <cuda_fp16.h>
static __device__ __forceinline__ half max(half a, half b)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hgt(__half(a), __half(b)) ? a : b;
#else
return a;
#endif
}
static __device__ __forceinline__ half min(half a, half b)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(__half(a), __half(b)) ? a : b;
#else
return a;
#endif
}
#endif // USE_FP16
#endif // DGL_ARRAY_FP16_CUH_
......@@ -7,6 +7,8 @@
#define DGL_ARRAY_CUDA_FUNCTOR_CUH_
#include "./atomic.cuh"
#include "./fp16.cuh"
#include <cmath>
namespace dgl {
namespace aten {
......@@ -122,9 +124,11 @@ template <typename DType> constexpr bool Dot<DType>::reduce_last_dim;
namespace reduce {
template <typename Idx,
typename DType,
bool atomic=false>
struct Sum {
static constexpr DType zero = 0;
bool atomic>
struct _Sum {
static constexpr __host__ __device__ __forceinline__ DType zero() {
return 0.;
};
static constexpr bool require_arg = false;
static __device__ __forceinline__ void Call(
DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
......@@ -148,16 +152,28 @@ struct Sum {
Idx *arg_u_buf, Idx *arg_e_buf,
DType val, DType val_ref, Idx uid, Idx eid) {}
};
template <typename Idx, typename DType, bool atomic>
constexpr DType Sum<Idx, DType, atomic>::zero;
template <typename Idx, typename DType, bool atomic>
constexpr bool Sum<Idx, DType, atomic>::require_arg;
template <typename Idx,
typename DType,
bool atomic=false>
struct Max {
static constexpr DType zero = -std::numeric_limits<DType>::infinity();
struct Sum: _Sum<Idx, DType, atomic> { };
#ifdef USE_FP16
template <typename Idx, bool atomic>
struct Sum<Idx, half, atomic>: _Sum<Idx, half, atomic> {
static constexpr __host__ __device__ __forceinline__ half zero() {
return __float2half_rn(0.);
};
};
#endif // USE_FP16
template <typename Idx,
typename DType,
bool atomic>
struct _Max {
static constexpr __host__ __device__ __forceinline__ DType zero() {
return -std::numeric_limits<DType>::infinity();
};
static constexpr bool require_arg = true;
static __device__ __forceinline__ void Call(
DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
......@@ -197,16 +213,29 @@ struct Max {
}
}
};
template <typename Idx, typename DType, bool atomic>
constexpr DType Max<Idx, DType, atomic>::zero;
template <typename Idx, typename DType, bool atomic>
constexpr bool Max<Idx, DType, atomic>::require_arg;
template <typename Idx,
typename DType,
bool atomic=false>
struct Min {
static constexpr DType zero = std::numeric_limits<DType>::infinity();
struct Max : _Max<Idx, DType, atomic> { };
#ifdef USE_FP16
template <typename Idx,
bool atomic>
struct Max<Idx, half, atomic> : _Max<Idx, half, atomic> {
static constexpr __host__ __device__ __forceinline__ half zero() {
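// -65504 is the lowest finite value representable in half precision.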
return __float2half_rn(-6.550400e+04f);
};
};
#endif
template <typename Idx,
typename DType,
bool atomic>
struct _Min {
static constexpr __host__ __device__ __forceinline__ DType zero() {
return std::numeric_limits<DType>::infinity();
};
static constexpr bool require_arg = true;
static __device__ __forceinline__ void Call(
DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
......@@ -246,10 +275,21 @@ struct Min {
}
}
};
template <typename Idx, typename DType, bool atomic>
constexpr DType Min<Idx, DType, atomic>::zero;
template <typename Idx, typename DType, bool atomic>
constexpr bool Min<Idx, DType, atomic>::require_arg;
template <typename Idx,
typename DType,
bool atomic=false>
struct Min : _Min<Idx, DType, atomic> { };
#ifdef USE_FP16
template <typename Idx,
bool atomic>
struct Min<Idx, half, atomic> : _Min<Idx, half, atomic> {
static constexpr __host__ __device__ __forceinline__ half zero() {
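// 65504 is the largest finite value representable in half precision.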
return __float2half_rn(6.550400e+04f);
};
};
#endif // USE_FP16
} // namespace reduce
......
......@@ -73,7 +73,7 @@ namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on Csr format.
*/
template <int XPU, typename IdType, typename DType>
template <int XPU, typename IdType, int bits>
void SDDMMCsr(const std::string& op,
const BcastOff& bcast,
const CSRMatrix& csr,
......@@ -82,17 +82,19 @@ void SDDMMCsr(const std::string& op,
NDArray out,
int lhs_target,
int rhs_target) {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, {
cuda::SDDMMCsr<IdType, DType, Op, LhsTarget, RhsTarget>(bcast, csr, lhs, rhs, out);
});
});
});
}
/*!
* \brief CUDA implementation of g-SDDMM on Coo format.
*/
template <int XPU, typename IdType, typename DType>
template <int XPU, typename IdType, int bits>
void SDDMMCoo(const std::string& op,
const BcastOff& bcast,
const COOMatrix& coo,
......@@ -101,43 +103,61 @@ void SDDMMCoo(const std::string& op,
NDArray out,
int lhs_target,
int rhs_target) {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, {
cuda::SDDMMCoo<IdType, DType, Op, LhsTarget, RhsTarget>(bcast, coo, lhs, rhs, out);
});
});
});
}
template void SDDMMCsr<kDLGPU, int32_t, float>(
template void SDDMMCsr<kDLGPU, int32_t, 16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCsr<kDLGPU, int64_t, 16>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCsr<kDLGPU, int32_t, 32>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCsr<kDLGPU, int64_t, float>(
template void SDDMMCsr<kDLGPU, int64_t, 32>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCsr<kDLGPU, int32_t, double>(
template void SDDMMCsr<kDLGPU, int32_t, 64>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCsr<kDLGPU, int64_t, double>(
template void SDDMMCsr<kDLGPU, int64_t, 64>(
const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLGPU, int32_t, float>(
template void SDDMMCoo<kDLGPU, int32_t, 16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLGPU, int64_t, 16>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLGPU, int32_t, 32>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLGPU, int64_t, float>(
template void SDDMMCoo<kDLGPU, int64_t, 32>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLGPU, int32_t, double>(
template void SDDMMCoo<kDLGPU, int32_t, 64>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
template void SDDMMCoo<kDLGPU, int64_t, double>(
template void SDDMMCoo<kDLGPU, int64_t, 64>(
const std::string& op, const BcastOff& bcast, const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out,
int lhs_target, int rhs_target);
......
......@@ -10,6 +10,7 @@
#include "macro.cuh"
#include "atomic.cuh"
#include "functor.cuh"
#include "fp16.cuh"
#include "./utils.h"
#include "../selector.h"
#include "../../runtime/cuda/cuda_common.h"
......@@ -105,7 +106,7 @@ __global__ void SDDMMCooTreeReduceKernel(
for (int i = blockIdx.y; i < out_len; i += gridDim.y) { // over output feature dimension
const Idx lhs_add = UseBcast ? __ldg(lhs_off + i) : i;
const Idx rhs_add = UseBcast ? __ldg(rhs_off + i) : i;
DType val = 0.;
DType val = reduce::Sum<Idx, DType>::zero();
for (int j = tx; j < reduce_size; j += 64) {
val += lhsoff[lhs_add * reduce_size + j] * rhsoff[rhs_add * reduce_size + j];
if (j + 32 < reduce_size)
......