Unverified Commit 6014623d authored by Xin Yao, committed by GitHub

wrap all cuda kernel calls with macro (#4066)


Co-authored-by: nv-dlasalle <63612878+nv-dlasalle@users.noreply.github.com>
Co-authored-by: Israt Nisa <neesha295@gmail.com>
parent a3ea4873
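
The diff replaces raw kernel launches of the form kernel<<<grid, block, shmem, stream>>>(args...) followed by CUDA_CALL(cudaGetLastError()) with a single CUDA_KERNEL_CALL(kernel, grid, block, shmem, stream, args...) statement from runtime/cuda/cuda_common.h, so every launch is error-checked in one place. The macro's definition is not shown in this diff; below is a minimal sketch of what such a wrapper plausibly looks like (the error-handling body is an assumption, and _FillKernel/Fill are hypothetical examples, not DGL code).

// Sketch of a launch-and-check wrapper in the spirit of CUDA_KERNEL_CALL.
// The real macro lives in runtime/cuda/cuda_common.h and presumably uses
// DGL's CHECK/logging machinery; the error handling here is a stand-in.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...)         \
  do {                                                                     \
    (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__);        \
    cudaError_t err = cudaGetLastError();                                  \
    if (err != cudaSuccess) {                                              \
      std::fprintf(stderr, "CUDA kernel launch error at %s:%d: %s\n",      \
                   __FILE__, __LINE__, cudaGetErrorString(err));           \
      std::exit(EXIT_FAILURE);                                             \
    }                                                                      \
  } while (0)

// A toy kernel and call site mirroring the diff's usage. Note the extra
// parentheses when the kernel is a template instantiation, as in
// CUDA_KERNEL_CALL((_IsInKernel<IdType, include>), ...): without them the
// comma inside the template argument list would be parsed by the
// preprocessor as a macro-argument separator.
template <typename IdType>
__global__ void _FillKernel(IdType* out, int64_t n, IdType value) {
  const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) out[idx] = value;
}

template <typename IdType>
void Fill(IdType* out, int64_t n, IdType value, cudaStream_t stream) {
  const dim3 block(256);
  const dim3 grid((n + block.x - 1) / block.x);
  CUDA_KERNEL_CALL((_FillKernel<IdType>), grid, block, 0, stream,
                   out, n, value);
}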
@@ -6,6 +6,7 @@
 #include <dgl/runtime/device_api.h>
+#include "../../runtime/cuda/cuda_common.h"
 #include "../filter.h"
 #include "../../runtime/cuda/cuda_hashtable.cuh"
 #include "./dgl_cub.cuh"
@@ -74,12 +75,12 @@ IdArray _PerformFilter(
     const dim3 block(256);
     const dim3 grid((size+block.x-1)/block.x);
-    _IsInKernel<IdType, include><<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL((_IsInKernel<IdType, include>),
+        grid, block, 0, stream,
         table.DeviceHandle(),
         static_cast<const IdType*>(test->data),
         size,
         prefix);
-    CUDA_CALL(cudaGetLastError());
   }
   // generate prefix-sum
...
@@ -251,7 +251,8 @@ FrequencyHashmap<IdxType>::FrequencyHashmap(
   dim3 block(BLOCK_SIZE);
   dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE);
   cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType));
-  _init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, _stream>>>(
+  CUDA_KERNEL_CALL((_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>),
+      grid, block, 0, _stream,
       edge_hashmap, (num_dst * num_items_each_dst));
   _device_edge_hashmap = new DeviceEdgeHashmap<IdxType>(
       num_dst, num_items_each_dst, dst_unique_edges, edge_hashmap);
@@ -305,7 +306,8 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   IdxType *unique_output_offsets = (num_unique_each_node_data + 2 * (num_dst_nodes + 1));
   // 1. Scan the all edges and count the unique edges and unique edges for each dst node
-  _count_frequency<IdxType, BLOCK_SIZE, TILE_SIZE><<<edges_grid, block, 0, _stream>>>(
+  CUDA_KERNEL_CALL((_count_frequency<IdxType, BLOCK_SIZE, TILE_SIZE>),
+      edges_grid, block, 0, _stream,
       src_data, num_edges, num_edges_per_node,
       edge_blocks_prefix, is_first_position, *_device_edge_hashmap);
@@ -337,7 +339,8 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   Idx64Type *unique_frequency = unique_frequency_data;
   Idx64Type *unique_frequency_alternate = unique_frequency_data + num_unique_edges;
   // 2.3 Compact the unique edges and their frequency
-  _compact_frequency<IdxType, Idx64Type, BLOCK_SIZE, TILE_SIZE><<<edges_grid, block, 0, _stream>>>(
+  CUDA_KERNEL_CALL((_compact_frequency<IdxType, Idx64Type, BLOCK_SIZE, TILE_SIZE>),
+      edges_grid, block, 0, _stream,
       src_data, dst_data, num_edges, num_edges_per_node,
       edge_blocks_prefix, is_first_position, num_unique_each_node,
       unique_src_edges, unique_frequency, *_device_edge_hashmap);
@@ -384,7 +387,8 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // 4.1 Reset the min(num_pick, num_unique_each_node) to num_unique_each_node
   constexpr int NODE_TILE_SIZE = BLOCK_SIZE * 2;
   const dim3 nodes_grid((num_dst_nodes + NODE_TILE_SIZE - 1) / NODE_TILE_SIZE);
-  _get_pick_num<IdxType, BLOCK_SIZE, NODE_TILE_SIZE><<<nodes_grid, block, 0, _stream>>>(
+  CUDA_KERNEL_CALL((_get_pick_num<IdxType, BLOCK_SIZE, NODE_TILE_SIZE>),
+      nodes_grid, block, 0, _stream,
       num_unique_each_node, num_pick, num_dst_nodes);
   // 4.2 ExclusiveSum the new num_unique_each_node as unique_output_offsets
   // use unique_output_offsets;
@@ -411,7 +415,8 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
       dtype, _ctx);
   IdArray res_cnt = IdArray::Empty({static_cast<int64_t>(num_output)},
       dtype, _ctx);
-  _pick_data<IdxType, Idx64Type, BLOCK_SIZE, NODE_TILE_SIZE><<<nodes_grid, block, 0, _stream>>>(
+  CUDA_KERNEL_CALL((_pick_data<IdxType, Idx64Type, BLOCK_SIZE, NODE_TILE_SIZE>),
+      nodes_grid, block, 0, _stream,
       d_unique_frequency.Current(), d_unique_src_edges.Current(), num_unique_each_node_alternate,
       dst_data, num_edges_per_node, num_dst_nodes, num_edges,
       unique_output_offsets,
...
@@ -239,11 +239,8 @@ MapEdges(
       const dim3 block(BLOCK_SIZE);
       // map the srcs
-      map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE><<<
-        grid,
-        block,
-        0,
-        stream>>>(
+      CUDA_KERNEL_CALL((map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE>),
+        grid, block, 0, stream,
         edges.src.Ptr<IdType>(),
         new_lhs.back().Ptr<IdType>(),
         edges.dst.Ptr<IdType>(),
@@ -251,7 +248,6 @@ MapEdges(
         num_edges,
         node_map.LhsHashTable(src_type).DeviceHandle(),
         node_map.RhsHashTable(dst_type).DeviceHandle());
-      CUDA_CALL(cudaGetLastError());
     } else {
       new_lhs.emplace_back(
         aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
...
@@ -6,6 +6,7 @@
 #include <cassert>
+#include "cuda_common.h"
 #include "cuda_hashtable.cuh"
 #include "../../array/cuda/atomic.cuh"
 #include "../../array/cuda/dgl_cub.cuh"
@@ -416,21 +417,21 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
   auto device_table = MutableDeviceOrderedHashTable<IdType>(this);
-  generate_hashmap_duplicates<IdType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, stream>>>(
+  CUDA_KERNEL_CALL((generate_hashmap_duplicates<IdType, BLOCK_SIZE, TILE_SIZE>),
+      grid, block, 0, stream,
      input,
      num_input,
      device_table);
-  CUDA_CALL(cudaGetLastError());
   IdType * item_prefix = static_cast<IdType*>(
       device->AllocWorkspace(ctx_, sizeof(IdType)*(num_input+1)));
-  count_hashmap<IdType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, stream>>>(
+  CUDA_KERNEL_CALL((count_hashmap<IdType, BLOCK_SIZE, TILE_SIZE>),
+      grid, block, 0, stream,
      input,
      num_input,
      device_table,
      item_prefix);
-  CUDA_CALL(cudaGetLastError());
   size_t workspace_bytes;
   CUDA_CALL(cub::DeviceScan::ExclusiveSum(
@@ -449,14 +450,14 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
       grid.x+1, stream));
   device->FreeWorkspace(ctx_, workspace);
-  compact_hashmap<IdType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, stream>>>(
+  CUDA_KERNEL_CALL((compact_hashmap<IdType, BLOCK_SIZE, TILE_SIZE>),
+      grid, block, 0, stream,
      input,
      num_input,
      device_table,
      item_prefix,
      unique,
      num_unique);
-  CUDA_CALL(cudaGetLastError());
   device->FreeWorkspace(ctx_, item_prefix);
 }
@@ -473,11 +474,11 @@ void OrderedHashTable<IdType>::FillWithUnique(
   auto device_table = MutableDeviceOrderedHashTable<IdType>(this);
-  generate_hashmap_unique<IdType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, stream>>>(
+  CUDA_KERNEL_CALL((generate_hashmap_unique<IdType, BLOCK_SIZE, TILE_SIZE>),
+      grid, block, 0, stream,
      input,
      num_input,
      device_table);
-  CUDA_CALL(cudaGetLastError());
 }
 template class OrderedHashTable<int32_t>;
...
@@ -193,7 +193,8 @@ std::pair<IdArray, NDArray> SparsePush(
     const dim3 block(256);
     const dim3 grid((num_in+block.x-1)/block.x);
-    _DualPermKernel<<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL(_DualPermKernel,
+        grid, block, 0, stream,
        static_cast<const IdType*>(in_idx->data),
        static_cast<const DType*>(in_value->data),
        perm,
@@ -201,7 +202,6 @@ std::pair<IdArray, NDArray> SparsePush(
        num_feat,
        send_idx.get(),
        send_value.get());
-    CUDA_CALL(cudaGetLastError());
   }
   // compute the prefix sum of the send values
@@ -346,13 +346,13 @@ NDArray SparsePull(
     const dim3 block(256);
     const dim3 grid((num_in+block.x-1)/block.x);
-    aten::impl::IndexSelectSingleKernel<<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL(aten::impl::IndexSelectSingleKernel,
+        grid, block, 0, stream,
        static_cast<const IdType*>(req_idx->data),
        perm,
        num_in,
        req_idx->shape[0],
        send_idx.get());
-    CUDA_CALL(cudaGetLastError());
   }
   // compute the prefix sum of the indexes this process is requesting
@@ -453,14 +453,14 @@ NDArray SparsePull(
     }
     const dim3 grid((response_prefix_host.back()+block.y-1)/block.y);
-    aten::impl::IndexSelectMultiKernel<<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL(aten::impl::IndexSelectMultiKernel,
+        grid, block, 0, stream,
        static_cast<const DType*>(local_tensor->data),
        num_feat,
        static_cast<IdType*>(recv_idx->data),
        response_prefix_host.back(),
        local_tensor->shape[0],
        filled_response_value.get());
-    CUDA_CALL(cudaGetLastError());
   }
   // we will collect recieved values in this array
@@ -499,13 +499,13 @@ NDArray SparsePull(
     }
     const dim3 grid((num_in+block.y-1)/block.y);
-    _InversePermKernel<<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL(_InversePermKernel,
+        grid, block, 0, stream,
        filled_request_value.get(),
        num_feat,
        num_in,
        perm,
        static_cast<DType*>(result->data));
-    CUDA_CALL(cudaGetLastError());
   }
   return result;
...