Unverified commit 2c04ecb5, authored by Zihao Ye, committed by GitHub

[hotfix] Skip CUDA kernel launch when number of blocks/threads is zero. (#2144)



* upd

* upd

* upd

* upd

* lint

* upd

* upd

* fmt
Co-authored-by: Quan (Andy) Gan <coin2028@hotmail.com>
parent 5f44a4ef
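Why the launch must be skipped: every call site in this diff computes the grid size by ceiling division, nb = (len + nt - 1) / nt, so an empty input (len == 0) yields nb == 0, and CUDA rejects any launch whose grid or block dimension is zero with cudaErrorInvalidConfiguration. A minimal standalone sketch of the failure mode and of the guard this commit adds (_DummyKernel and the sizes are hypothetical, not DGL code):

// Editor's sketch, not part of this commit: launching with a zero-sized
// grid fails; guarding the launch (what CUDA_KERNEL_CALL below does) does not.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void _DummyKernel(int* out, int64_t len) {
  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (tx < len) out[tx] = 0;
}

int main() {
  const int64_t len = 0;               // empty array, e.g. a graph with no edges
  const int nt = 256;                  // thread count is always positive
  const int nb = (len + nt - 1) / nt;  // ceiling division; 0 blocks when len == 0

  _DummyKernel<<<nb, nt>>>(nullptr, len);
  printf("unguarded: %s\n", cudaGetErrorString(cudaGetLastError()));
  // prints "invalid configuration argument" (cudaErrorInvalidConfiguration)

  if (nb != 0 && nt != 0) {            // the guard, in macro form below
    _DummyKernel<<<nb, nt>>>(nullptr, len);
  }
  printf("guarded: %s\n", cudaGetErrorString(cudaGetLastError()));
  // prints "no error": the launch was skipped, so there is nothing to report
  return 0;
}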
@@ -36,7 +36,8 @@ NDArray IndexSelect(NDArray array, IdArray index) {
DType* ret_data = static_cast<DType*>(ret->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_IndexSelectKernel<<<nb, nt, 0, thr_entry->stream>>>(array_data, idx_data, len, ret_data);
CUDA_KERNEL_CALL(_IndexSelectKernel, nb, nt, 0, thr_entry->stream,
array_data, idx_data, len, ret_data);
return ret;
}
......
@@ -36,7 +36,8 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
nb, nt, 0, thr_entry->stream,
lhs_data, rhs_data, ret_data, len);
return ret;
}
@@ -85,7 +86,8 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
nb, nt, 0, thr_entry->stream,
lhs_data, rhs, ret_data, len);
return ret;
}
@@ -135,7 +137,8 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
nb, nt, 0, thr_entry->stream,
lhs, rhs_data, ret_data, len);
return ret;
}
@@ -183,7 +186,8 @@ IdArray UnaryElewise(IdArray lhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_UnaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_UnaryElewiseKernel<IdType, Op>),
nb, nt, 0, thr_entry->stream,
lhs_data, ret_data, len);
return ret;
}
@@ -211,7 +215,8 @@ IdArray Full(IdType val, int64_t length, DLContext ctx) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
_FullKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, length, val);
CUDA_KERNEL_CALL((_FullKernel<IdType>), nb, nt, 0, thr_entry->stream,
ret_data, length, val);
return ret;
}
@@ -242,7 +247,9 @@ IdArray Range(IdType low, IdType high, DLContext ctx) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
_RangeKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, low, length);
CUDA_KERNEL_CALL((_RangeKernel<IdType>),
nb, nt, 0, thr_entry->stream,
ret_data, low, length);
return ret;
}
@@ -270,10 +277,12 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) {
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
if (bits == 32) {
_CastKernel<IdType, int32_t><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_CastKernel<IdType, int32_t>),
nb, nt, 0, thr_entry->stream,
static_cast<IdType*>(arr->data), static_cast<int32_t*>(ret->data), length);
} else {
_CastKernel<IdType, int64_t><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_CastKernel<IdType, int64_t>),
nb, nt, 0, thr_entry->stream,
static_cast<IdType*>(arr->data), static_cast<int64_t*>(ret->data), length);
}
return ret;
......
@@ -33,7 +33,8 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_ScatterKernel<<<nb, nt, 0, thr_entry->stream>>>(idx, val, len, outd);
CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, thr_entry->stream,
idx, val, len, outd);
}
template void Scatter_<kDLGPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
......
@@ -131,7 +131,8 @@ CSRMatrix COOToCSR<kDLGPU, int64_t>(COOMatrix coo) {
const int nt = cuda::FindNumThreads(coo.num_rows);
const int nb = (coo.num_rows + nt - 1) / nt;
IdArray indptr = Full(0, coo.num_rows + 1, nbits, ctx);
_SortedSearchKernelUpperBound<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SortedSearchKernelUpperBound,
nb, nt, 0, thr_entry->stream,
coo.row.Ptr<int64_t>(), nnz,
rowids.Ptr<int64_t>(), coo.num_rows,
indptr.Ptr<int64_t>() + 1);
......
@@ -155,7 +155,7 @@ std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
const int nt = cuda::FindNumThreads(nnz);
const int nb = (nnz + nt - 1) / nt;
_COOIsSortedKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_COOIsSortedKernel, nb, nt, 0, thr_entry->stream,
coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
nnz, row_flags, col_flags);
......
@@ -91,7 +91,8 @@ COOMatrix CSRToCOO<kDLGPU, int64_t>(CSRMatrix csr) {
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
_RepeatKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_RepeatKernel,
nb, nt, 0, thr_entry->stream,
rowids.Ptr<int64_t>(), row_nnz.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
csr.num_rows);
......
@@ -44,7 +44,8 @@ bool CSRIsSorted(CSRMatrix csr) {
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
_SegmentIsSorted<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentIsSorted,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
......
@@ -182,14 +182,13 @@ void SDDMMCoo(
const bool use_idx = !IsNullArray(coo.data);
BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, {
SDDMMCooKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>
<<<nblks, nthrs, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((SDDMMCooKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>),
nblks, nthrs, 0, thr_entry->stream,
lhs_data, rhs_data, out_data,
row, col, edge_map,
coo.num_rows, coo.num_cols, nnz, reduce_dim,
lhs_off, rhs_off,
lhs_len, rhs_len, len
);
lhs_len, rhs_len, len);
});
}
@@ -233,14 +232,13 @@ void SDDMMCsr(
const bool use_idx = !IsNullArray(csr.data);
BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, {
SDDMMCsrKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>
<<<nblks, nthrs, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((SDDMMCsrKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>),
nblks, nthrs, 0, thr_entry->stream,
lhs_data, rhs_data, out_data,
indptr, indices, edge_map,
N, M, E, reduce_dim,
lhs_off, rhs_off,
lhs_len, rhs_len, len
);
lhs_len, rhs_len, len);
});
}
......
@@ -61,7 +61,8 @@ bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
IdArray out = aten::NewIdArray(1, ctx, sizeof(IdType) * 8);
const IdType* data = nullptr;
// TODO(minjie): use binary search for sorted csr
_LinearSearchKernel<<<1, 1, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_LinearSearchKernel,
1, 1, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), data,
rows.Ptr<IdType>(), cols.Ptr<IdType>(),
1, 1, 1,
@@ -88,7 +89,8 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr;
// TODO(minjie): use binary search for sorted csr
_LinearSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_LinearSearchKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), data,
row.Ptr<IdType>(), col.Ptr<IdType>(),
row_stride, col_stride, rstlen,
@@ -134,7 +136,8 @@ bool CSRHasDuplicate(CSRMatrix csr) {
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
_SegmentHasNoDuplicate<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentHasNoDuplicate,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
@@ -182,7 +185,8 @@ NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
IdType* rst_data = static_cast<IdType*>(rst->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_CSRGetRowNNZKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_CSRGetRowNNZKernel,
nb, nt, 0, thr_entry->stream,
vid_data, indptr_data, rst_data, len);
return rst;
}
@@ -281,13 +285,15 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
const int nb = (len + nt - 1) / nt;
// Copy indices.
IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
_SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentCopyKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_indices.Ptr<IdType>());
// Copy data.
IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
_SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentCopyKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_data.Ptr<IdType>());
@@ -321,7 +327,8 @@ IdArray CSRGetData(CSRMatrix csr, NDArray row, NDArray col) {
const int nt = cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
// TODO(minjie): use binary search for sorted csr
_LinearSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_LinearSearchKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
row.Ptr<IdType>(), col.Ptr<IdType>(),
@@ -422,7 +429,8 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
IdArray mask = Full(0, nnz, nbits, ctx);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_SegmentMaskKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentMaskKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
row.Ptr<IdType>(), col.Ptr<IdType>(),
row_stride, col_stride, len,
@@ -437,7 +445,8 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits);
const int nt2 = cuda::FindNumThreads(idx->shape[0]);
const int nb2 = (idx->shape[0] + nt2 - 1) / nt2;
_SortedSearchKernel<<<nb2, nt2, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SortedSearchKernel,
nb2, nt2, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.num_rows,
idx.Ptr<IdType>(), idx->shape[0],
ret_row.Ptr<IdType>());
@@ -512,7 +521,8 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
_SegmentMaskColKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentMaskColKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), csr.num_rows,
cols.Ptr<IdType>(), cols->shape[0],
mask.Ptr<IdType>(), count.Ptr<IdType>());
......
@@ -21,7 +21,7 @@ void _Fill(DType* ptr, size_t length, DType val) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = FindNumThreads(length);
int nb = (length + nt - 1) / nt; // on x-axis, no need to worry about upperbound.
cuda::_FillKernel<<<nb, nt, 0, thr_entry->stream>>>(ptr, length, val);
CUDA_KERNEL_CALL(cuda::_FillKernel, nb, nt, 0, thr_entry->stream, ptr, length, val);
}
} // namespace
......
@@ -224,7 +224,8 @@ void SpMMCoo(
int64_t out_size = out.NumElements();
const int nt = FindNumThreads(out_size);
const int nb = (out_size + nt - 1) / nt;
_FillKernel<<<nb, nt, 0, thr_entry->stream>>>(out_data, out_size, ReduceOp::zero);
CUDA_KERNEL_CALL(_FillKernel, nb, nt, 0, thr_entry->stream,
out_data, out_size, ReduceOp::zero);
const int ntx = FindNumThreads(len);
const int nty = CUDA_MAX_NUM_THREADS / ntx;
@@ -236,23 +237,21 @@
const bool use_idx = !IsNullArray(coo.data);
BCAST_IDX_CTX_SWITCH(bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, {
SpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
<<<nblks, nthrs, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((SpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
row, col, edge_map,
N, M, E,
ubcast_off, ebcast_off,
lhs_len, rhs_len, len
);
lhs_len, rhs_len, len);
if (ReduceOp::require_arg) {
ArgSpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
<<<nblks, nthrs, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((ArgSpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
row, col, edge_map,
N, M, E,
ubcast_off, ebcast_off,
lhs_len, rhs_len, len
);
lhs_len, rhs_len, len);
}
});
}
@@ -303,14 +302,13 @@ void SpMMCsr(
const bool use_idx = !IsNullArray(csr.data);
BCAST_IDX_CTX_SWITCH(bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, {
SpMMCsrKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
<<<nblks, nthrs, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((SpMMCsrKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
indptr, indices, edge_map,
csr.num_rows, csr.num_cols, efeat->shape[0],
ubcast_off, ebcast_off,
lhs_len, rhs_len, len
);
lhs_len, rhs_len, len);
});
}
......
@@ -108,7 +108,8 @@ void FarthestPointSampler(NDArray array, int64_t batch_size, int64_t sample_poin
// sample for each cloud in the batch
IdType* start_idx_data = static_cast<IdType*>(start_idx->data);
fps_kernel<<<batch_size, THREADS, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(fps_kernel,
batch_size, THREADS, 0, thr_entry->stream,
array_data, batch_size, sample_points,
point_in_batch, dim, start_idx_data, dist_data, ret_data);
}
......
@@ -25,7 +25,7 @@ void Fill(const DLContext& ctx, DType* ptr, size_t length, DType val) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = utils::FindNumThreads(length, 1024);
int nb = (length + nt - 1) / nt;
_FillKernel<<<nb, nt, 0, thr_entry->stream>>>(ptr, length, val);
CUDA_KERNEL_CALL(_FillKernel, nb, nt, 0, thr_entry->stream, ptr, length, val);
}
template void Fill<kDLGPU, float>(const DLContext& ctx, float* ptr, size_t length, float val);
......
@@ -16,6 +16,16 @@
namespace dgl {
namespace runtime {
template <typename T>
inline bool is_zero(T size) {
return size == 0;
}
template <>
inline bool is_zero<dim3>(dim3 size) {
return size.x == 0 || size.y == 0 || size.z == 0;
}
#define CUDA_DRIVER_CALL(x) \
{ \
CUresult result = x; \
@@ -34,6 +44,19 @@ namespace runtime {
<< "CUDA: " << cudaGetErrorString(e); \
}
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \
{ \
if (!dgl::runtime::is_zero((nblks)) && \
!dgl::runtime::is_zero((nthrs))) { \
(kernel) <<< (nblks), (nthrs), (shmem), (stream) >>> \
(__VA_ARGS__); \
cudaError_t e = cudaGetLastError(); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA kernel launch error: " \
<< cudaGetErrorString(e); \
} \
}
#define CUSPARSE_CALL(func) \
{ \
cusparseStatus_t e = (func); \
......
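End-to-end behavior of the new macro, as a self-contained sketch mirroring the is_zero helpers and CUDA_KERNEL_CALL from the diff above. Simplifications here are assumptions, not DGL code: no dgl::runtime namespace, CHECK replaced by a plain printf, the default stream (0) instead of thr_entry->stream, and a hypothetical _TestFillKernel:

#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
inline bool is_zero(T size) { return size == 0; }

// dim3 launch configurations: skipped if any of x, y, or z is zero.
template <>
inline bool is_zero<dim3>(dim3 size) {
  return size.x == 0 || size.y == 0 || size.z == 0;
}

#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...)        \
  {                                                                       \
    if (!is_zero((nblks)) && !is_zero((nthrs))) {                         \
      (kernel) <<< (nblks), (nthrs), (shmem), (stream) >>> (__VA_ARGS__); \
      cudaError_t e = cudaGetLastError();                                 \
      if (e != cudaSuccess)                                               \
        printf("CUDA kernel launch error: %s\n", cudaGetErrorString(e));  \
    }                                                                     \
  }

__global__ void _TestFillKernel(int* ptr, int64_t length, int val) {
  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (tx < length) ptr[tx] = val;
}

int main() {
  const int nt = 256;

  // Empty input: nb == 0, so the macro silently skips the launch.
  int64_t len = 0;
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL(_TestFillKernel, nb, nt, 0, 0, nullptr, len, 1);

  // Non-empty input: launches as before.
  len = 1000;
  nb = (len + nt - 1) / nt;  // 4 blocks
  int* data;
  cudaMalloc(&data, len * sizeof(int));
  CUDA_KERNEL_CALL(_TestFillKernel, nb, nt, 0, 0, data, len, 1);

  // dim3 case: the is_zero<dim3> specialization skips this launch too.
  CUDA_KERNEL_CALL(_TestFillKernel, dim3(0, 1, 1), dim3(32, 1, 1), 0, 0,
                   data, len, 2);

  cudaDeviceSynchronize();
  cudaFree(data);
  return 0;
}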