Unverified commit f1b19a6b, authored by Minjie Wang and committed by GitHub

[CUDA] Many CUDA operators; Prepare for DGLGraph on CUDA (#1660)

* add cuda utils; change g.to; add g.device

* split array.h into several headers

* cuda index select

* file

* three cuda kernels

* add cuda elementwise arith and several others

* cuda CSRIsNonZero

* fix lint

* lint

* lint

* fix bug in changing ctx to property

* address comments

* remove unused codes

* address comments
parent 42b0c38f
......@@ -12,14 +12,7 @@
#include <utility>
#include "../../c_api_common.h"
#define CHECK_SAME_DTYPE(VAR1, VAR2) \
CHECK(VAR1->dtype == VAR2->dtype) \
<< "Expected " << (#VAR2) << " to be the same type as " << (#VAR1) << "(" \
<< (VAR1)->dtype << ")" \
<< ". But got " << (VAR2)->dtype;
namespace dgl {
namespace aten {
/*!
......@@ -125,8 +118,7 @@ struct PairHash {
}
};
}; // namespace aten
}; // namespace dgl
} // namespace aten
} // namespace dgl
#endif // DGL_ARRAY_CPU_ARRAY_UTILS_H_
......@@ -8,6 +8,7 @@
#include <dgl/array.h>
#include <functional>
#include <algorithm>
namespace dgl {
namespace aten {
......
......@@ -20,8 +20,6 @@ namespace impl {
template <DLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col;
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
if (csr.sorted) {
......@@ -43,8 +41,6 @@ template bool CSRIsNonZero<kDLCPU, int64_t>(CSRMatrix, int64_t, int64_t);
template <DLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
CHECK_SAME_DTYPE(csr.indices, row);
CHECK_SAME_DTYPE(csr.indices, col);
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
const auto rstlen = std::max(rowlen, collen);
......@@ -90,7 +86,6 @@ template bool CSRHasDuplicate<kDLCPU, int64_t>(CSRMatrix csr);
template <DLDeviceType XPU, typename IdType>
int64_t CSRGetRowNNZ(CSRMatrix csr, int64_t row) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
return indptr_data[row + 1] - indptr_data[row];
}
......@@ -120,7 +115,6 @@ template NDArray CSRGetRowNNZ<kDLCPU, int64_t>(CSRMatrix, NDArray);
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
const int64_t len = impl::CSRGetRowNNZ<XPU, IdType>(csr, row);
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const int64_t offset = indptr_data[row] * sizeof(IdType);
......@@ -134,7 +128,6 @@ template NDArray CSRGetRowColumnIndices<kDLCPU, int64_t>(CSRMatrix, int64_t);
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
const int64_t len = impl::CSRGetRowNNZ<XPU, IdType>(csr, row);
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const int64_t offset = indptr_data[row] * sizeof(IdType);
......@@ -172,8 +165,6 @@ void CollectDataFromSorted(const IdType *indices_data, const IdType *data,
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col;
std::vector<IdType> ret_vec;
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
......@@ -197,8 +188,6 @@ template NDArray CSRGetData<kDLCPU, int64_t>(CSRMatrix, int64_t, int64_t);
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetData(CSRMatrix csr, NDArray rows, NDArray cols) {
CHECK_SAME_DTYPE(csr.indices, rows);
CHECK_SAME_DTYPE(csr.indices, cols);
const int64_t rowlen = rows->shape[0];
const int64_t collen = cols->shape[0];
......@@ -266,8 +255,6 @@ void CollectDataIndicesFromSorted(const IdType *indices_data, const IdType *data
template <DLDeviceType XPU, typename IdType>
std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray rows, NDArray cols) {
CHECK_SAME_DTYPE(csr.indices, rows);
CHECK_SAME_DTYPE(csr.indices, cols);
// TODO(minjie): more efficient implementation for matrix without duplicate entries
const int64_t rowlen = rows->shape[0];
const int64_t collen = cols->shape[0];
......
/*!
* Copyright (c) 2019 by Contributors
* \file array/cuda/array_index_select.cu
* \brief Array index select GPU implementation
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../cuda_utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
template <typename DType, typename IdType>
__global__ void _IndexSelectKernel(const DType* array, const IdType* index,
int64_t length, DType* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = array[index[tx]];
tx += stride_x;
}
}
template<DLDeviceType XPU, typename DType, typename IdType>
NDArray IndexSelect(NDArray array, IdArray index) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const DType* array_data = static_cast<DType*>(array->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = array->shape[0];
const int64_t len = index->shape[0];
NDArray ret = NDArray::Empty({len}, array->dtype, array->ctx);
if (len == 0)
return ret;
DType* ret_data = static_cast<DType*>(ret->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_IndexSelectKernel<<<nb, nt, 0, thr_entry->stream>>>(array_data, idx_data, len, ret_data);
return ret;
}
template NDArray IndexSelect<kDLGPU, int32_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, int32_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, int64_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, int64_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, float, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, float, int64_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, double, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, double, int64_t>(NDArray, IdArray);
template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, uint64_t index) {
auto device = runtime::DeviceAPI::Get(array->ctx);
DType ret = 0;
device->CopyDataFromTo(
static_cast<DType*>(array->data) + index, 0, &ret, 0,
sizeof(DType), array->ctx, DLContext{kDLCPU, 0},
array->dtype, nullptr);
return ret;
}
template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, uint64_t index);
template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, uint64_t index);
template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, uint64_t index);
template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, uint64_t index);
template float IndexSelect<kDLGPU, float>(NDArray array, uint64_t index);
template double IndexSelect<kDLGPU, double>(NDArray array, uint64_t index);
} // namespace impl
} // namespace aten
} // namespace dgl
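For orientation, the two overloads above are a device-side gather and a single-element device-to-host read. Below is a minimal sequential sketch of the gather that _IndexSelectKernel parallelizes with its grid-stride loop; the helper name is illustrative and not part of this patch.
template <typename DType, typename IdType>
void IndexSelectReference(const DType* array, const IdType* index,
                          int64_t length, DType* out) {
  // out[i] = array[index[i]] for every i in [0, length); in the kernel each
  // CUDA thread covers the positions i, i + stride_x, i + 2 * stride_x, ...
  for (int64_t i = 0; i < length; ++i)
    out[i] = array[index[i]];
}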
/*!
* Copyright (c) 2019 by Contributors
* Copyright (c) 2020 by Contributors
* \file array/cuda/array_op_impl.cu
* \brief Array operator GPU implementation
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../cuda_utils.h"
#include "../arith.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
int FindNumThreads(int dim, int max_nthrs) {
int ret = max_nthrs;
while (ret > dim) {
ret = ret >> 1;
}
return ret;
}
///////////////////////////// BinaryElewise /////////////////////////////
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
const IdType* lhs, const IdType* rhs, IdType* out, int64_t length) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = Op::Call(lhs[tx], rhs[tx]);
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
const int64_t len = lhs->shape[0];
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
const IdType* rhs_data = static_cast<IdType*>(rhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
lhs_data, rhs_data, ret_data, len);
return ret;
}
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(IdArray lhs, IdArray rhs);
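The Op template parameter is one of the functors from ../arith.h (Add, Sub, GT, ...). A hypothetical functor in that style, shown only to illustrate the static Call() contract the kernels rely on (this is not the actual arith.h definition):
struct ExampleAdd {
  template <typename T>
  static __device__ __forceinline__ T Call(T lhs, T rhs) {
    // with such an Op, _BinaryElewiseKernel computes out[i] = Op::Call(lhs[i], rhs[i])
    return lhs + rhs;
  }
};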
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
const IdType* lhs, IdType rhs, IdType* out, int64_t length) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = Op::Call(lhs[tx], rhs);
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdArray lhs, IdType rhs) {
const int64_t len = lhs->shape[0];
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
lhs_data, rhs, ret_data, len);
return ret;
}
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(IdArray lhs, int64_t rhs);
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
IdType lhs, const IdType* rhs, IdType* out, int64_t length) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = Op::Call(lhs, rhs[tx]);
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdType lhs, IdArray rhs) {
const int64_t len = rhs->shape[0];
IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits);
const IdType* rhs_data = static_cast<IdType*>(rhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
lhs, rhs_data, ret_data, len);
return ret;
}
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(int64_t lhs, IdArray rhs);
template <typename IdType, typename Op>
__global__ void _UnaryElewiseKernel(
const IdType* lhs, IdType* out, int64_t length) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = Op::Call(lhs[tx]);
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray UnaryElewise(IdArray lhs) {
const int64_t len = lhs->shape[0];
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_UnaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
lhs_data, ret_data, len);
return ret;
}
template IdArray UnaryElewise<kDLGPU, int32_t, arith::Neg>(IdArray lhs);
template IdArray UnaryElewise<kDLGPU, int64_t, arith::Neg>(IdArray lhs);
///////////////////////////// Full /////////////////////////////
template <typename IdType>
__global__ void _FullKernel(
IdType* out, int64_t length, IdType val) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = val;
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType>
IdArray Full(IdType val, int64_t length, DLContext ctx) {
IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8);
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
_FullKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, length, val);
return ret;
}
template IdArray Full<kDLGPU, int32_t>(int32_t val, int64_t length, DLContext ctx);
template IdArray Full<kDLGPU, int64_t>(int64_t val, int64_t length, DLContext ctx);
///////////////////////////// Range /////////////////////////////
......@@ -41,7 +234,7 @@ IdArray Range(IdType low, IdType high, DLContext ctx) {
return ret;
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = FindNumThreads(length, 1024);
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
_RangeKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, low, length);
return ret;
......@@ -68,7 +261,7 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) {
IdArray ret = IdArray::Empty(shape, DLDataType{kDLInt, bits, 1}, arr->ctx);
const int64_t length = ret.NumElements();
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = FindNumThreads(length, 1024);
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
if (bits == 32) {
_CastKernel<IdType, int32_t><<<nb, nt, 0, thr_entry->stream>>>(
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/spmat_op_impl.cu
* \brief Sparse matrix operator GPU implementation
*/
#include <dgl/array.h>
#include <vector>
#include <unordered_set>
#include <numeric>
#include "../../runtime/cuda/cuda_common.h"
#include "../../cuda_utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
///////////////////////////// CSRIsNonZero /////////////////////////////
/*!
* \brief Search adjacency list linearly for each (row, col) pair and
* write the matched position in the indices array to the output.
*
* If there is no match, -1 is written.
* If there are multiple matches, only the first match is written.
*/
template <typename IdType>
__global__ void _LinearSearchKernel(
const IdType* indptr, const IdType* indices,
const IdType* row, const IdType* col,
int64_t row_stride, int64_t col_stride,
int64_t length, IdType* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
int64_t rpos = tx * row_stride, cpos = tx * col_stride;
while (tx < length) {
out[tx] = -1;
const IdType r = row[rpos], c = col[cpos];
for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) {
if (indices[i] == c) {
out[tx] = i;
break;
}
}
rpos += stride_x * row_stride;
cpos += stride_x * col_stride;
tx += stride_x;
}
}
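A sequential reference for the semantics documented above, where a stride of 0 broadcasts a length-1 row or col array across all queries; this helper is illustrative only and not part of the patch.
template <typename IdType>
void LinearSearchReference(const IdType* indptr, const IdType* indices,
                           const IdType* row, const IdType* col,
                           int64_t row_stride, int64_t col_stride,
                           int64_t length, IdType* out) {
  for (int64_t i = 0; i < length; ++i) {
    const IdType r = row[i * row_stride], c = col[i * col_stride];
    out[i] = -1;  // default when (r, c) has no entry
    for (IdType k = indptr[r]; k < indptr[r + 1]; ++k) {
      if (indices[k] == c) { out[i] = k; break; }  // first match wins
    }
  }
}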
template <DLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const auto& ctx = csr.indptr->ctx;
IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
rows = rows.CopyTo(ctx);
cols = cols.CopyTo(ctx);
IdArray out = aten::NewIdArray(1, ctx, sizeof(IdType) * 8);
// TODO(minjie): use binary search for sorted csr
_LinearSearchKernel<<<1, 1, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
rows.Ptr<IdType>(), cols.Ptr<IdType>(),
1, 1, 1,
out.Ptr<IdType>());
out = out.CopyTo(DLContext{kDLCPU, 0});
return *out.Ptr<IdType>() != -1;
}
template bool CSRIsNonZero<kDLGPU, int32_t>(CSRMatrix, int64_t, int64_t);
template bool CSRIsNonZero<kDLGPU, int64_t>(CSRMatrix, int64_t, int64_t);
template <DLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
const auto rstlen = std::max(rowlen, collen);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
if (rstlen == 0)
return rst;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const int nt = cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
// TODO(minjie): use binary search for sorted csr
_LinearSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
row.Ptr<IdType>(), col.Ptr<IdType>(),
row_stride, col_stride, rstlen,
rst.Ptr<IdType>());
return rst != -1;
}
template NDArray CSRIsNonZero<kDLGPU, int32_t>(CSRMatrix, NDArray, NDArray);
template NDArray CSRIsNonZero<kDLGPU, int64_t>(CSRMatrix, NDArray, NDArray);
///////////////////////////// CSRGetRowNNZ /////////////////////////////
template <DLDeviceType XPU, typename IdType>
int64_t CSRGetRowNNZ(CSRMatrix csr, int64_t row) {
const IdType cur = aten::IndexSelect<IdType>(csr.indptr, row);
const IdType next = aten::IndexSelect<IdType>(csr.indptr, row + 1);
return next - cur;
}
template int64_t CSRGetRowNNZ<kDLGPU, int32_t>(CSRMatrix, int64_t);
template int64_t CSRGetRowNNZ<kDLGPU, int64_t>(CSRMatrix, int64_t);
template <typename IdType>
__global__ void _CSRGetRowNNZKernel(
const IdType* vid,
const IdType* indptr,
IdType* out,
int64_t length) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
const IdType vv = vid[tx];
out[tx] = indptr[vv + 1] - indptr[vv];
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const auto len = rows->shape[0];
const IdType* vid_data = static_cast<IdType*>(rows->data);
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
IdType* rst_data = static_cast<IdType*>(rst->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_CSRGetRowNNZKernel<<<nb, nt, 0, thr_entry->stream>>>(
vid_data, indptr_data, rst_data, len);
return rst;
}
template NDArray CSRGetRowNNZ<kDLGPU, int32_t>(CSRMatrix, NDArray);
template NDArray CSRGetRowNNZ<kDLGPU, int64_t>(CSRMatrix, NDArray);
///////////////////////////// CSRGetRowColumnIndices /////////////////////////////
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row) {
const int64_t len = impl::CSRGetRowNNZ<XPU, IdType>(csr, row);
const int64_t offset = aten::IndexSelect<IdType>(csr.indptr, row) * sizeof(IdType);
return csr.indices.CreateView({len}, csr.indices->dtype, offset);
}
template NDArray CSRGetRowColumnIndices<kDLGPU, int32_t>(CSRMatrix, int64_t);
template NDArray CSRGetRowColumnIndices<kDLGPU, int64_t>(CSRMatrix, int64_t);
///////////////////////////// CSRGetRowData /////////////////////////////
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
const int64_t len = impl::CSRGetRowNNZ<XPU, IdType>(csr, row);
const IdType start = aten::IndexSelect<IdType>(csr.indptr, row);
if (aten::CSRHasData(csr))
return csr.data.CreateView({len}, csr.data->dtype, start * sizeof(IdType));
else
return aten::Range(start, start + len, csr.indptr->dtype.bits, csr.indptr->ctx);
}
template NDArray CSRGetRowData<kDLGPU, int32_t>(CSRMatrix, int64_t);
template NDArray CSRGetRowData<kDLGPU, int64_t>(CSRMatrix, int64_t);
} // namespace impl
} // namespace aten
} // namespace dgl
/*!
* Copyright (c) 2020 by Contributors
* \file cuda_utils.h
* \brief Utilities for CUDA kernels.
*/
#ifndef DGL_CUDA_UTILS_H_
#define DGL_CUDA_UTILS_H_
#include <dmlc/logging.h>
namespace dgl {
namespace cuda {
#define CUDA_MAX_NUM_BLOCKS_X 0x7FFFFFFF
#define CUDA_MAX_NUM_BLOCKS_Y 0xFFFF
#define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF
#define CUDA_MAX_NUM_THREADS 1024
/*! \brief Calculate the number of threads needed given the dimension length.
*
* It finds the largest power of two that does not exceed min(dim, max_nthrs),
* assuming max_nthrs is itself a power of two.
*/
inline int FindNumThreads(int dim, int max_nthrs = CUDA_MAX_NUM_THREADS) {
CHECK_NE(dim, 0);
int ret = max_nthrs;
while (ret > dim) {
ret = ret >> 1;
}
return ret;
}
} // namespace cuda
} // namespace dgl
#endif // DGL_CUDA_UTILS_H_
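As a usage sketch, the launch configurations in the .cu files above all follow the same pattern: a power-of-two block size from FindNumThreads and enough blocks to cover the array. The values in the comments assume a hypothetical length of 1000 elements.
const int64_t len = 1000;                        // hypothetical array length
const int nt = dgl::cuda::FindNumThreads(len);   // 512: largest power of two <= min(1000, 1024)
const int nb = (len + nt - 1) / nt;              // 2 blocks cover all 1000 elements
// _SomeKernel<<<nb, nt, 0, stream>>>(...);      // as in IndexSelect, BinaryElewise, etc.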
......@@ -753,10 +753,13 @@ def test_flatten(index_dtype):
@unittest.skipIf(F._default_context_str == 'cpu', reason="Need gpu for this test")
@parametrize_dtype
def test_to_device(index_dtype):
hg = create_test_heterograph(index_dtype)
g = create_test_heterograph(index_dtype)
g.nodes['user'].data['h'] = F.copy_to(F.ones((3, 5)), F.cpu())
g.nodes['game'].data['i'] = F.copy_to(F.ones((2, 5)), F.cpu())
g.edges['plays'].data['e'] = F.copy_to(F.ones((4, 4)), F.cpu())
if F.is_cuda_available():
hg = hg.to(F.cuda())
assert hg is not None
g1 = g.to(F.cuda())
assert g1 is not None
@parametrize_dtype
def test_convert_bound(index_dtype):
......
......@@ -86,59 +86,106 @@ TEST(ArrayTest, TestAsNumBits) {
};
template <typename IDX>
void _TestArith() {
void _TestArith(DLContext ctx) {
const int N = 100;
IdArray a = aten::Full(-10, N, sizeof(IDX)*8, CTX);
IdArray b = aten::Full(7, N, sizeof(IDX)*8, CTX);
IdArray a = aten::Full(-10, N, sizeof(IDX)*8, ctx);
IdArray b = aten::Full(7, N, sizeof(IDX)*8, ctx);
IdArray c = aten::Add(a, b);
IdArray c = a + b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -3);
c = aten::Sub(a, b);
c = a - b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -17);
c = aten::Mul(a, b);
c = a * b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -70);
c = aten::Div(a, b);
c = a / b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -1);
c = -a;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 10);
const int val = -3;
c = aten::Add(a, val);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -13);
c = aten::Sub(a, val);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -7);
c = aten::Mul(a, val);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 30);
c = aten::Div(a, val);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 3);
c = aten::Add(val, b);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 4);
c = aten::Sub(val, b);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -10);
c = aten::Mul(val, b);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -21);
c = aten::Div(val, b);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 0);
a = aten::Range(0, N, sizeof(IDX)*8, CTX);
c = aten::LT(a, 50);
a = aten::Range(0, N, sizeof(IDX)*8, ctx);
c = a < 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i < 50));
c = a > 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i > 50));
c = a >= 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i >= 50));
c = a <= 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i <= 50));
c = a == 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i == 50));
c = a != 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i != 50));
}
TEST(ArrayTest, TestArith) {
_TestArith<int32_t>();
_TestArith<int64_t>();
_TestArith<int32_t>(CPU);
_TestArith<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestArith<int32_t>(GPU);
_TestArith<int64_t>(GPU);
#endif
};
template <typename IDX>
......@@ -158,17 +205,21 @@ TEST(ArrayTest, TestHStack) {
}
template <typename IDX>
void _TestIndexSelect() {
IdArray a = aten::Range(0, 100, sizeof(IDX)*8, CTX);
void _TestIndexSelect(DLContext ctx) {
IdArray a = aten::Range(0, 100, sizeof(IDX)*8, ctx);
ASSERT_EQ(aten::IndexSelect<int>(a, 50), 50);
IdArray b = aten::VecToIdArray(std::vector<IDX>({0, 20, 10}), sizeof(IDX)*8, CTX);
IdArray b = aten::VecToIdArray(std::vector<IDX>({0, 20, 10}), sizeof(IDX)*8, ctx);
IdArray c = aten::IndexSelect(a, b);
ASSERT_TRUE(ArrayEQ<IDX>(b, c));
}
TEST(ArrayTest, TestIndexSelect) {
_TestIndexSelect<int32_t>();
_TestIndexSelect<int64_t>();
_TestIndexSelect<int32_t>(CPU);
_TestIndexSelect<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestIndexSelect<int32_t>(GPU);
_TestIndexSelect<int64_t>(GPU);
#endif
}
template <typename IDX>
......
......@@ -115,74 +115,90 @@ aten::COOMatrix COO3(DLContext ctx) {
} // namespace
template <typename IDX>
void _TestCSRIsNonZero() {
auto csr = CSR1<IDX>();
void _TestCSRIsNonZero(DLContext ctx) {
auto csr = CSR1<IDX>(ctx);
ASSERT_TRUE(aten::CSRIsNonZero(csr, 0, 1));
ASSERT_FALSE(aten::CSRIsNonZero(csr, 0, 0));
IdArray r = aten::VecToIdArray(std::vector<IDX>({2, 2, 0, 0}), sizeof(IDX)*8, CTX);
IdArray c = aten::VecToIdArray(std::vector<IDX>({1, 1, 1, 3}), sizeof(IDX)*8, CTX);
IdArray r = aten::VecToIdArray(std::vector<IDX>({2, 2, 0, 0}), sizeof(IDX)*8, ctx);
IdArray c = aten::VecToIdArray(std::vector<IDX>({1, 1, 1, 3}), sizeof(IDX)*8, ctx);
IdArray x = aten::CSRIsNonZero(csr, r, c);
IdArray tx = aten::VecToIdArray(std::vector<IDX>({0, 0, 1, 0}), sizeof(IDX)*8, CTX);
IdArray tx = aten::VecToIdArray(std::vector<IDX>({0, 0, 1, 0}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
}
TEST(SpmatTest, TestCSRIsNonZero) {
_TestCSRIsNonZero<int32_t>();
_TestCSRIsNonZero<int64_t>();
_TestCSRIsNonZero<int32_t>(CPU);
_TestCSRIsNonZero<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRIsNonZero<int32_t>(GPU);
_TestCSRIsNonZero<int64_t>(GPU);
#endif
}
template <typename IDX>
void _TestCSRGetRowNNZ() {
auto csr = CSR2<IDX>();
void _TestCSRGetRowNNZ(DLContext ctx) {
auto csr = CSR2<IDX>(ctx);
ASSERT_EQ(aten::CSRGetRowNNZ(csr, 0), 3);
ASSERT_EQ(aten::CSRGetRowNNZ(csr, 3), 0);
IdArray r = aten::VecToIdArray(std::vector<IDX>({0, 3}), sizeof(IDX)*8, CTX);
IdArray r = aten::VecToIdArray(std::vector<IDX>({0, 3}), sizeof(IDX)*8, ctx);
IdArray x = aten::CSRGetRowNNZ(csr, r);
IdArray tx = aten::VecToIdArray(std::vector<IDX>({3, 0}), sizeof(IDX)*8, CTX);
IdArray tx = aten::VecToIdArray(std::vector<IDX>({3, 0}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
}
TEST(SpmatTest, TestCSRGetRowNNZ) {
_TestCSRGetRowNNZ<int32_t>();
_TestCSRGetRowNNZ<int64_t>();
_TestCSRGetRowNNZ<int32_t>(CPU);
_TestCSRGetRowNNZ<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRGetRowNNZ<int32_t>(GPU);
_TestCSRGetRowNNZ<int64_t>(GPU);
#endif
}
template <typename IDX>
void _TestCSRGetRowColumnIndices() {
auto csr = CSR2<IDX>();
void _TestCSRGetRowColumnIndices(DLContext ctx) {
auto csr = CSR2<IDX>(ctx);
auto x = aten::CSRGetRowColumnIndices(csr, 0);
auto tx = aten::VecToIdArray(std::vector<IDX>({1, 2, 2}), sizeof(IDX)*8, CTX);
auto tx = aten::VecToIdArray(std::vector<IDX>({1, 2, 2}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
x = aten::CSRGetRowColumnIndices(csr, 1);
tx = aten::VecToIdArray(std::vector<IDX>({0}), sizeof(IDX)*8, CTX);
tx = aten::VecToIdArray(std::vector<IDX>({0}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
x = aten::CSRGetRowColumnIndices(csr, 3);
tx = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, CTX);
tx = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
}
TEST(SpmatTest, TestCSRGetRowColumnIndices) {
_TestCSRGetRowColumnIndices<int32_t>();
_TestCSRGetRowColumnIndices<int64_t>();
_TestCSRGetRowColumnIndices<int32_t>(CPU);
_TestCSRGetRowColumnIndices<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRGetRowColumnIndices<int32_t>(GPU);
_TestCSRGetRowColumnIndices<int64_t>(GPU);
#endif
}
template <typename IDX>
void _TestCSRGetRowData() {
auto csr = CSR2<IDX>();
void _TestCSRGetRowData(DLContext ctx) {
auto csr = CSR2<IDX>(ctx);
auto x = aten::CSRGetRowData(csr, 0);
auto tx = aten::VecToIdArray(std::vector<IDX>({0, 2, 5}), sizeof(IDX)*8, CTX);
auto tx = aten::VecToIdArray(std::vector<IDX>({0, 2, 5}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
x = aten::CSRGetRowData(csr, 1);
tx = aten::VecToIdArray(std::vector<IDX>({3}), sizeof(IDX)*8, CTX);
tx = aten::VecToIdArray(std::vector<IDX>({3}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
x = aten::CSRGetRowData(csr, 3);
tx = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, CTX);
tx = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
}
TEST(SpmatTest, TestCSRGetRowData) {
_TestCSRGetRowData<int32_t>();
_TestCSRGetRowData<int64_t>();
_TestCSRGetRowData<int32_t>(CPU);
_TestCSRGetRowData<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRGetRowData<int32_t>(GPU);
_TestCSRGetRowData<int64_t>(GPU);
#endif
}
template <typename IDX>
......