"tests/vscode:/vscode.git/clone" did not exist on "6694f7b9451692f1c91c595c56eb2ce5e218524f"
Unverified commit 020f0249, authored by nv-dlasalle, committed by GitHub

[Performance][Optimizer] Enable using UVA and FP16 with SparseAdam Optimizer (#3885)



* Add UVA by default to embedding

* More updates

* Update optimizer

* Add new UVA functions

* Expose new pinned memory function

* Add unit tests

* Update formatting

* Fix unit test

* Handle auto UVA case when training is on CPU

* Allow per-embedding decisions for whether to use UVA

* Address sparse_optim.py comments

* Remove unused templates

* Update unit test

* Use DGL-allocated memory for pinning

* Allow automatic unpinning

* Work around D2H copies with a different dtype

* Fix linting

* Update error message

* Update copyright
Co-authored-by: Xin Yao <xiny@nvidia.com>
Co-authored-by: Minjie Wang <wmjlyjemaine@gmail.com>
parent 548c85ff
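
For orientation, here is a minimal, hypothetical training sketch using the new options, assuming the dgl.nn.NodeEmbedding and dgl.optim.SparseAdam APIs shown in the diff below; the sizes and the embedding name 'feat' are made up.

import torch as th
import dgl

emb = dgl.nn.NodeEmbedding(10000, 128, 'feat')       # embedding kept in CPU memory
opt = dgl.optim.SparseAdam([emb], lr=0.01,
                           use_uva=True,             # pin the 'mem'/'power' state for direct GPU access
                           dtype=th.float16)         # store the optimizer state in FP16
idx = th.randint(0, 10000, (1024,))
feat = emb(idx, th.device('cuda'))                   # gather embedding rows onto the GPU
loss = feat.sum()
opt.zero_grad()
loss.backward()
opt.step()                                           # state is read/written through UVA gather/scatter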
......@@ -3,7 +3,9 @@ import abc
from abc import abstractmethod
import torch as th
from ...utils import get_shared_mem_array, create_shared_mem_array
from ...utils import get_shared_mem_array, create_shared_mem_array, \
pin_memory_inplace, gather_pinned_tensor_rows, \
scatter_pinned_tensor_rows
from ...nn.pytorch import NodeEmbedding
from ...cuda import nccl
from ...partition import NDArrayPartition
......@@ -434,7 +436,7 @@ class SparseAdagrad(SparseGradOptimizer):
state = th.empty(
emb.weight.shape,
dtype=th.float32,
device=eth.device('cpu')).zero_()
device=th.device('cpu')).zero_()
elif self._rank == 0:
state = create_shared_mem_array(emb_name+'_state', \
emb.weight.shape, th.float32).zero_()
......@@ -519,6 +521,16 @@ class SparseAdam(SparseGradOptimizer):
eps : float, Optional
The term added to the denominator to improve numerical stability
Default: 1e-8
use_uva : bool, Optional
Whether to use pinned memory for storing the 'mem' and 'power' parameters
when the embedding is stored on the CPU. This will improve training
speed, but requires locking a large number of virtual memory pages.
For embeddings stored in GPU memory, this setting has no effect.
Default: True if the gradients are generated on the GPU, and False
if the gradients are on the CPU.
dtype : torch.dtype, Optional
The type to store optimizer state with. Default: th.float32.
Examples
--------
......@@ -534,12 +546,26 @@ class SparseAdam(SparseGradOptimizer):
... loss.backward()
... optimizer.step()
'''
def __init__(self, params, lr, betas=(0.9, 0.999), eps=1e-08):
def __init__(self, params, lr, betas=(0.9, 0.999), eps=1e-08, \
use_uva=None, dtype=th.float32):
super(SparseAdam, self).__init__(params, lr)
self._lr = lr
self._beta1 = betas[0]
self._beta2 = betas[1]
self._eps = eps
self._use_uva = use_uva
self._nd_handle = {}
self._is_using_uva = {}
assert dtype in [th.float16, th.float32], \
"Unsupported dtype {}. Valid choices are th.float32 " \
"and th.float32".format(dtype)
self._dtype = dtype
def _setup_uva(self, name, mem, power):
self._is_using_uva[name] = True
mem_nd = pin_memory_inplace(mem)
power_nd = pin_memory_inplace(power)
self._nd_handle[name] = [mem_nd, power_nd]
def setup(self, params):
# We need to register a state sum for each embedding in the kvstore.
......@@ -547,28 +573,29 @@ class SparseAdam(SparseGradOptimizer):
assert isinstance(emb, NodeEmbedding), \
'SparseAdam only supports dgl.nn.NodeEmbedding'
emb_name = emb.name
self._is_using_uva[emb_name] = self._use_uva
if th.device(emb.emb_tensor.device) == th.device('cpu'):
# if our embedding is on the CPU, our state also has to be
if self._rank < 0:
state_step = th.empty(
(emb.weight.shape[0],),
dtype=th.float32,
dtype=th.int32,
device=th.device('cpu')).zero_()
state_mem = th.empty(
emb.weight.shape,
dtype=th.float32,
dtype=self._dtype,
device=th.device('cpu')).zero_()
state_power = th.empty(
emb.weight.shape,
dtype=th.float32,
dtype=self._dtype,
device=th.device('cpu')).zero_()
elif self._rank == 0:
state_step = create_shared_mem_array(emb_name+'_step', \
(emb.weight.shape[0],), th.float32).zero_()
(emb.weight.shape[0],), th.int32).zero_()
state_mem = create_shared_mem_array(emb_name+'_mem', \
emb.weight.shape, th.float32).zero_()
emb.weight.shape, self._dtype).zero_()
state_power = create_shared_mem_array(emb_name+'_power', \
emb.weight.shape, th.float32).zero_()
emb.weight.shape, self._dtype).zero_()
if self._world_size > 1:
emb.store.set(emb_name+'_opt', emb_name)
......@@ -576,24 +603,32 @@ class SparseAdam(SparseGradOptimizer):
# receive
emb.store.wait([emb_name+'_opt'])
state_step = get_shared_mem_array(emb_name+'_step', \
(emb.weight.shape[0],), th.float32)
(emb.weight.shape[0],), th.int32)
state_mem = get_shared_mem_array(emb_name+'_mem', \
emb.weight.shape, th.float32)
emb.weight.shape, self._dtype)
state_power = get_shared_mem_array(emb_name+'_power', \
emb.weight.shape, th.float32)
emb.weight.shape, self._dtype)
if self._is_using_uva[emb_name]:
# use_uva was explicitly set to True; if it is None,
# wait until the first step to decide
self._setup_uva(emb_name, state_mem, state_power)
else:
# make sure we don't use UVA when data is on the GPU
self._is_using_uva[emb_name] = False
# distributed state on gpu
state_step = th.empty(
[emb.emb_tensor.shape[0]],
dtype=th.float32,
dtype=th.int32,
device=emb.emb_tensor.device).zero_()
state_mem = th.empty(
emb.emb_tensor.shape,
dtype=th.float32,
dtype=self._dtype,
device=emb.emb_tensor.device).zero_()
state_power = th.empty(
emb.emb_tensor.shape,
dtype=th.float32,
dtype=self._dtype,
device=emb.emb_tensor.device).zero_()
state = (state_step, state_mem, state_power)
emb.set_optm_state(state)
......@@ -613,20 +648,34 @@ class SparseAdam(SparseGradOptimizer):
Sparse embedding to update.
"""
with th.no_grad():
beta1 = self._beta1
beta2 = self._beta2
eps = self._eps
clr = self._lr
state_step, state_mem, state_power = emb.optm_state
exec_dtype = grad.dtype
exec_dev = grad.device
state_dev = state_step.device
# whether or not we need to transfer data from the GPU to the CPU
# while updating the weights
is_d2h = state_dev.type == 'cpu' and exec_dev.type == 'cuda'
# only perform async copies cpu -> gpu or gpu -> gpu, but block
# when copying to the cpu, so as to ensure the copy is finished
# before operating on the data on the cpu
state_block = state_dev == th.device('cpu') and exec_dev != state_dev
state_block = is_d2h
if self._is_using_uva[emb.name] is None and is_d2h:
# we should use UVA going forward
self._setup_uva(emb.name, state_mem, state_power)
elif self._is_using_uva[emb.name] is None:
# we shouldn't use UVA going forward
self._is_using_uva[emb.name] = False
use_uva = self._is_using_uva[emb.name]
beta1 = self._beta1
beta2 = self._beta2
eps = self._eps
clr = self._lr
# There can be duplicated indices due to sampling.
# Thus unique them here and average the gradient here.
grad_indices, inverse, cnt = th.unique(idx,
......@@ -635,8 +684,16 @@ class SparseAdam(SparseGradOptimizer):
state_idx = grad_indices.to(state_dev)
state_step[state_idx] += 1
state_step = state_step[state_idx].to(exec_dev)
if use_uva:
orig_mem = gather_pinned_tensor_rows(state_mem, grad_indices)
orig_power = gather_pinned_tensor_rows(state_power, grad_indices)
else:
orig_mem = state_mem[state_idx].to(exec_dev)
orig_power = state_power[state_idx].to(exec_dev)
# convert to exec dtype
orig_mem = orig_mem.to(dtype=exec_dtype)
orig_power = orig_power.to(dtype=exec_dtype)
grad_values = th.zeros((grad_indices.shape[0], grad.shape[1]), device=exec_dev)
grad_values.index_add_(0, inverse, grad)
......@@ -647,8 +704,19 @@ class SparseAdam(SparseGradOptimizer):
update_mem = beta1 * orig_mem + (1.-beta1) * grad_mem
update_power = beta2 * orig_power + (1.-beta2) * grad_power
update_mem_dst = update_mem.to(state_dev, non_blocking=True)
update_power_dst = update_power.to(state_dev, non_blocking=True)
if use_uva:
scatter_pinned_tensor_rows(state_mem, \
grad_indices, \
update_mem.to(dtype=self._dtype))
scatter_pinned_tensor_rows(state_power, \
grad_indices, \
update_power.to(dtype=self._dtype))
else:
update_mem_dst = update_mem.to(dtype=self._dtype).to(
state_dev, non_blocking=True)
update_power_dst = update_power.to(dtype=self._dtype).to(
state_dev, non_blocking=True)
if state_block:
# use events to try and overlap CPU and GPU as much as possible
update_event = th.cuda.Event()
......@@ -664,6 +732,9 @@ class SparseAdam(SparseGradOptimizer):
if state_block:
std_event = th.cuda.Event()
std_event.record()
if not use_uva:
if state_block:
# wait for our transfers from exec_dev to state_dev to finish
# before we can use them
update_event.wait()
......
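
The comments above describe overlapping the asynchronous device-to-host state copy with the remaining GPU work. A standalone, hypothetical sketch of that event pattern in plain PyTorch (independent of the optimizer internals):

import torch as th

if th.cuda.is_available():
    gpu_t = th.randn(1024, 128, device='cuda')
    cpu_t = th.empty(1024, 128, pin_memory=True)
    cpu_t.copy_(gpu_t, non_blocking=True)   # async D2H copy into pinned host memory
    copy_done = th.cuda.Event()
    copy_done.record()                      # mark the point where the copy was enqueued
    extra = (gpu_t * 2.0).sum()             # unrelated GPU work overlaps the copy
    copy_done.synchronize()                 # block the host until the copy has finished
    print(cpu_t[0, 0].item(), extra.item())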
......@@ -49,4 +49,21 @@ def gather_pinned_tensor_rows(tensor, rows):
"""
return F.from_dgl_nd(_CAPI_DGLIndexSelectCPUFromGPU(F.to_dgl_nd(tensor), F.to_dgl_nd(rows)))
def scatter_pinned_tensor_rows(dest, rows, source):
"""Directly scatter rows from a GPU tensor given an indices array on CUDA devices,
to a pinned tensor on the CPU.
Parameters
----------
dest : Tensor
The tensor on the CPU to scatter rows to. Must be in pinned memory.
rows : Tensor
The rows to scatter. Must be a CUDA tensor with unique entries.
source : Tensor
The tensor on the GPU to scatter rows from.
"""
_CAPI_DGLIndexScatterGPUToCPU(F.to_dgl_nd(dest), F.to_dgl_nd(rows),
F.to_dgl_nd(source))
_init_api("dgl.ndarray.uvm", __name__)
/*!
* Copyright (c) 2021 by Contributors
* \file array/cpu/array_index_select.cuh
* Copyright (c) 2021-2022 by Contributors
* \file array/cuda/array_index_select.cuh
* \brief Array index select GPU kernel implementation
*/
......@@ -50,6 +50,45 @@ __global__ void IndexSelectMultiKernel(
}
}
template <typename DType, typename IdType>
__global__ void IndexScatterSingleKernel(const DType* array,
const IdType* index,
const int64_t length,
const int64_t arr_len,
DType* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
assert(index[tx] >= 0 && index[tx] < arr_len);
out[index[tx]] = array[tx];
tx += stride_x;
}
}
template <typename DType, typename IdType>
__global__ void IndexScatterMultiKernel(
const DType* const array,
const int64_t num_feat,
const IdType* const index,
const int64_t length,
const int64_t arr_len,
DType* const out) {
int64_t in_row = blockIdx.x*blockDim.y+threadIdx.y;
const int64_t stride = blockDim.y*gridDim.x;
while (in_row < length) {
int64_t col = threadIdx.x;
const int64_t out_row = index[in_row];
assert(out_row >= 0 && out_row < arr_len);
while (col < num_feat) {
out[out_row*num_feat+col] = array[in_row*num_feat+col];
col += blockDim.x;
}
in_row += stride;
}
}
} // namespace impl
} // namespace aten
} // namespace dgl
......
/*!
* Copyright (c) 2019 by Contributors
* \file array/cpu/array_index_select_uvm.cu
* Copyright (c) 2019-2022 by Contributors
* \file array/cuda/uvm/array_index_select_uvm.cu
* \brief Array index select GPU implementation
*/
#include <dgl/array.h>
......@@ -62,6 +62,7 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
return ret;
}
// floating point types are treated as their equal width integer types
template NDArray IndexSelectCPUFromGPU<int8_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int8_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int16_t, int32_t>(NDArray, IdArray);
......@@ -70,10 +71,57 @@ template NDArray IndexSelectCPUFromGPU<int32_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int32_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int64_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<float, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<float, int64_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<double, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<double, int64_t>(NDArray, IdArray);
template<typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
DType* dest_data = static_cast<DType*>(dest->data);
const DType* source_data = static_cast<DType*>(source->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = dest->shape[0];
const int64_t len = index->shape[0];
int64_t num_feat = 1;
std::vector<int64_t> shape{len};
CHECK(dest.IsPinned());
CHECK_EQ(index->ctx.device_type, kDLGPU);
CHECK_EQ(source->ctx.device_type, kDLGPU);
for (int d = 1; d < source->ndim; ++d) {
num_feat *= source->shape[d];
}
if (len == 0)
return;
if (num_feat == 1) {
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(IndexScatterSingleKernel, nb, nt, 0,
thr_entry->stream, source_data, idx_data, len, arr_len, dest_data);
} else {
dim3 block(256, 1);
while (static_cast<int64_t>(block.x) >= 2*num_feat) {
block.x /= 2;
block.y *= 2;
}
const dim3 grid((len+block.y-1)/block.y);
CUDA_KERNEL_CALL(IndexScatterMultiKernel, grid, block, 0,
thr_entry->stream, source_data, num_feat, idx_data,
len, arr_len, dest_data);
}
}
// floating point types are treated as their equal width integer types
template void IndexScatterGPUToCPU<int8_t, int32_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int8_t, int64_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int16_t, int32_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int16_t, int64_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int32_t, int32_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int32_t, int64_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int64_t, int32_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int64_t, int64_t>(NDArray, IdArray, NDArray);
} // namespace impl
} // namespace aten
......
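
For the multi-feature kernel, the host code above trades x-threads (features) for y-rows until the block width roughly matches num_feat. A hypothetical Python transcription of that sizing loop, for illustration only:

def pick_block(num_feat, max_threads=256):
    # Mirrors the loop in IndexScatterGPUToCPU: start at (256, 1) and halve
    # block.x / double block.y while block.x is at least twice num_feat.
    bx, by = max_threads, 1
    while bx >= 2 * num_feat:
        bx //= 2
        by *= 2
    return bx, by

print(pick_block(1))    # (1, 256): one feature column, 256 rows per block
print(pick_block(128))  # (128, 2): wide rows keep most threads on features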
/*!
* Copyright (c) 2019 by Contributors
* Copyright (c) 2019-2022 by Contributors
* \file array/uvm_array.cc
* \brief DGL array utilities implementation
*/
......@@ -15,24 +15,42 @@ namespace aten {
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
#ifdef DGL_USE_CUDA
CHECK(array.IsPinned())
<< "Only the CPUPinned device type input array is supported";
CHECK_EQ(index->ctx.device_type, kDLGPU)
<< "Only the GPU device type input index is supported";
CHECK(array.IsPinned()) << "Input array must be in pinned memory.";
CHECK_EQ(index->ctx.device_type, kDLGPU) << "Index must be on the GPU.";
CHECK_GE(array->ndim, 1) << "Input array must have at least 1 dimension.";
CHECK_EQ(index->ndim, 1) << "Index must be a 1D array.";
CHECK_GE(array->ndim, 1) << "Only support array with at least 1 dimension";
CHECK_EQ(index->ndim, 1) << "Index array must be an 1D array.";
ATEN_DTYPE_BITS_ONLY_SWITCH(array->dtype, DType, "values", {
ATEN_ID_TYPE_SWITCH(index->dtype, IdType, {
return impl::IndexSelectCPUFromGPU<DType, IdType>(array, index);
});
});
#endif
LOG(FATAL) << "IndexSelectCPUFromGPU requires CUDA";
LOG(FATAL) << "IndexSelectCPUFromGPU requires CUDA.";
// Should be unreachable
return NDArray{};
}
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
#ifdef DGL_USE_CUDA
CHECK(dest.IsPinned()) << "Destination array must be in pinned memory.";
CHECK_EQ(index->ctx.device_type, kDLGPU) << "Index must be on the GPU.";
CHECK_EQ(source->ctx.device_type, kDLGPU) << "Source array must be on the GPU.";
CHECK_EQ(dest->dtype, source->dtype) << "Destination array and source "
"array must have the same dtype.";
CHECK_GE(dest->ndim, 1) << "Destination array must have at least 1 dimension.";
CHECK_EQ(index->ndim, 1) << "Index must be a 1D array.";
ATEN_DTYPE_BITS_ONLY_SWITCH(source->dtype, DType, "values", {
ATEN_ID_TYPE_SWITCH(index->dtype, IdType, {
impl::IndexScatterGPUToCPU<DType, IdType>(dest, index, source);
});
});
#else
LOG(FATAL) << "IndexScatterGPUToCPU requires CUDA.";
#endif
}
DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexSelectCPUFromGPU")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
NDArray array = args[0];
......@@ -40,5 +58,15 @@ DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexSelectCPUFromGPU")
*rv = IndexSelectCPUFromGPU(array, index);
});
DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexScatterGPUToCPU")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
NDArray dest = args[0];
IdArray index = args[1];
NDArray source = args[2];
IndexScatterGPUToCPU(dest, index, source);
});
} // namespace aten
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file array/array_op.h
* Copyright (c) 2019-2022 by Contributors
* \file array/uvm_array_op.h
* \brief Array operator templates
*/
#ifndef DGL_ARRAY_UVM_ARRAY_OP_H_
......@@ -17,6 +17,9 @@ namespace impl {
template <typename DType, typename IdType>
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index);
template <typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source);
} // namespace impl
} // namespace aten
} // namespace dgl
......
......@@ -10,9 +10,9 @@ from dgl.nn import NodeEmbedding
from dgl.optim import SparseAdam, SparseAdagrad
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_sparse_adam():
@pytest.mark.parametrize('emb_dim', [1, 4, 101, 1024])
def test_sparse_adam(emb_dim):
num_embs = 10
emb_dim = 4
device=F.ctx()
dgl_emb = NodeEmbedding(num_embs, emb_dim, 'test')
torch_emb = th.nn.Embedding(num_embs, emb_dim, sparse=True)
......@@ -28,7 +28,50 @@ def test_sparse_adam():
idx = th.randint(0, num_embs, size=(4,))
dgl_value = dgl_emb(idx, device).to(th.device('cpu'))
torch_value = torch_emb(idx)
labels = th.ones((4,)).long()
labels = th.zeros((4,)).long()
print("dgl_value = {}".format(dgl_value))
print("labels = {}".format(labels))
dgl_adam.zero_grad()
torch_adam.zero_grad()
dgl_loss = th.nn.functional.cross_entropy(dgl_value, labels)
torch_loss = th.nn.functional.cross_entropy(torch_value, labels)
dgl_loss.backward()
torch_loss.backward()
dgl_adam.step()
torch_adam.step()
assert F.allclose(dgl_emb.weight, torch_emb.weight)
# Can not test second step
# Pytorch sparseAdam maintains a global step
# DGL sparseAdam uses a per embedding step
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@pytest.mark.parametrize('use_uva', [False, True, None])
@pytest.mark.parametrize('emb_dim', [1, 4, 101, 1024])
def test_sparse_adam_uva(use_uva, emb_dim):
if F.ctx().type == 'cpu' and use_uva == True:
# we want to only test values of False and None when not using GPU
pytest.skip("UVA cannot be used without GPUs.")
num_embs = 10
device=F.ctx()
dgl_emb = NodeEmbedding(num_embs, emb_dim, 'test_uva{}'.format(use_uva))
torch_emb = th.nn.Embedding(num_embs, emb_dim, sparse=True)
th.manual_seed(0)
th.nn.init.uniform_(torch_emb.weight, 0, 1.0)
th.manual_seed(0)
th.nn.init.uniform_(dgl_emb.weight, 0, 1.0)
dgl_adam = SparseAdam(params=[dgl_emb], lr=0.01, use_uva=use_uva)
torch_adam = th.optim.SparseAdam(list(torch_emb.parameters()), lr=0.01)
# first step
idx = th.randint(0, num_embs, size=(4,))
dgl_value = dgl_emb(idx, device).to(th.device('cpu'))
torch_value = torch_emb(idx)
labels = th.zeros((4,)).long()
dgl_adam.zero_grad()
torch_adam.zero_grad()
......@@ -45,6 +88,45 @@ def test_sparse_adam():
# Pytorch sparseAdam maintains a global step
# DGL sparseAdam uses a per embedding step
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@pytest.mark.parametrize('dtype', [th.float32, th.float16])
@pytest.mark.parametrize('emb_dim', [1, 4, 101, 1024])
def test_sparse_adam_dtype(dtype, emb_dim):
num_embs = 10
device=F.ctx()
dgl_emb = NodeEmbedding(num_embs, emb_dim, 'test_dtype{}'.format(dtype))
torch_emb = th.nn.Embedding(num_embs, emb_dim, sparse=True)
th.manual_seed(0)
th.nn.init.uniform_(torch_emb.weight, 0, 1.0)
th.manual_seed(0)
th.nn.init.uniform_(dgl_emb.weight, 0, 1.0)
dgl_adam = SparseAdam(params=[dgl_emb], lr=0.01, dtype=dtype)
torch_adam = th.optim.SparseAdam(list(torch_emb.parameters()), lr=0.01)
# first step
idx = th.randint(0, num_embs, size=(4,))
dgl_value = dgl_emb(idx, device).to(th.device('cpu'))
torch_value = torch_emb(idx)
labels = th.zeros((4,)).long()
dgl_adam.zero_grad()
torch_adam.zero_grad()
dgl_loss = th.nn.functional.cross_entropy(dgl_value, labels)
torch_loss = th.nn.functional.cross_entropy(torch_value, labels)
dgl_loss.backward()
torch_loss.backward()
dgl_adam.step()
torch_adam.step()
assert F.allclose(dgl_emb.weight, torch_emb.weight)
# Can not test second step
# Pytorch sparseAdam maintains a global step
# DGL sparseAdam uses a per embedding step
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_sparse_adam_zero_step():
num_embs = 10
......@@ -385,7 +467,10 @@ def test_multiprocess_sparse_adam_zero_step_cuda_tensor(num_workers):
assert F.allclose(dgl_weight, torch_weight)
if __name__ == '__main__':
test_sparse_adam()
test_sparse_adam(1)
test_sparse_adam(4)
test_sparse_adam(101)
test_sparse_adam(1024)
test_sparse_adam_zero_step()
test_multiprocess_cpu_sparse_adam(2)
......