Unverified Commit 6014623d authored by Xin Yao, committed by GitHub

wrap all cuda kernel calls with macro (#4066)


Co-authored-by: nv-dlasalle <63612878+nv-dlasalle@users.noreply.github.com>
Co-authored-by: Israt Nisa <neesha295@gmail.com>
parent a3ea4873
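
The diff replaces raw kernel launches of the form kernel<<<grid, block, shmem, stream>>>(args...) followed by CUDA_CALL(cudaGetLastError()) with a single CUDA_KERNEL_CALL(kernel, grid, block, shmem, stream, args...) statement from runtime/cuda/cuda_common.h, so every launch is error-checked in one place. The macro's definition is not shown in this diff; below is a minimal sketch of what such a wrapper plausibly looks like (the error-handling body is an assumption, and _FillKernel/Fill are hypothetical examples, not DGL code).

// Sketch of a launch-and-check wrapper in the spirit of CUDA_KERNEL_CALL.
// The real macro lives in runtime/cuda/cuda_common.h and presumably uses
// DGL's CHECK/logging machinery; the error handling here is a stand-in.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...)         \
  do {                                                                     \
    (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__);        \
    cudaError_t err = cudaGetLastError();                                  \
    if (err != cudaSuccess) {                                              \
      std::fprintf(stderr, "CUDA kernel launch error at %s:%d: %s\n",      \
                   __FILE__, __LINE__, cudaGetErrorString(err));           \
      std::exit(EXIT_FAILURE);                                             \
    }                                                                      \
  } while (0)

// A toy kernel and call site mirroring the diff's usage. Note the extra
// parentheses when the kernel is a template instantiation, as in
// CUDA_KERNEL_CALL((_IsInKernel<IdType, include>), ...): without them the
// comma inside the template argument list would be parsed by the
// preprocessor as a macro-argument separator.
template <typename IdType>
__global__ void _FillKernel(IdType* out, int64_t n, IdType value) {
  const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) out[idx] = value;
}

template <typename IdType>
void Fill(IdType* out, int64_t n, IdType value, cudaStream_t stream) {
  const dim3 block(256);
  const dim3 grid((n + block.x - 1) / block.x);
  CUDA_KERNEL_CALL((_FillKernel<IdType>), grid, block, 0, stream,
                   out, n, value);
}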
@@ -6,6 +6,7 @@
 #include <dgl/runtime/device_api.h>
+#include "../../runtime/cuda/cuda_common.h"
 #include "../filter.h"
 #include "../../runtime/cuda/cuda_hashtable.cuh"
 #include "./dgl_cub.cuh"
@@ -74,12 +75,12 @@ IdArray _PerformFilter(
     const dim3 block(256);
     const dim3 grid((size+block.x-1)/block.x);
-    _IsInKernel<IdType, include><<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL((_IsInKernel<IdType, include>),
+        grid, block, 0, stream,
         table.DeviceHandle(),
         static_cast<const IdType*>(test->data),
         size,
         prefix);
-    CUDA_CALL(cudaGetLastError());
   }
   // generate prefix-sum
...
@@ -251,7 +251,8 @@ FrequencyHashmap<IdxType>::FrequencyHashmap(
   dim3 block(BLOCK_SIZE);
   dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE);
   cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType));
-  _init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, _stream>>>(
+  CUDA_KERNEL_CALL((_init_edge_table<IdxType, BLOCK_SIZE, TILE_SIZE>),
+      grid, block, 0, _stream,
       edge_hashmap, (num_dst * num_items_each_dst));
   _device_edge_hashmap = new DeviceEdgeHashmap<IdxType>(
       num_dst, num_items_each_dst, dst_unique_edges, edge_hashmap);
@@ -305,7 +306,8 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   IdxType *unique_output_offsets = (num_unique_each_node_data + 2 * (num_dst_nodes + 1));
   // 1. Scan the all edges and count the unique edges and unique edges for each dst node
-  _count_frequency<IdxType, BLOCK_SIZE, TILE_SIZE><<<edges_grid, block, 0, _stream>>>(
+  CUDA_KERNEL_CALL((_count_frequency<IdxType, BLOCK_SIZE, TILE_SIZE>),
+      edges_grid, block, 0, _stream,
       src_data, num_edges, num_edges_per_node,
       edge_blocks_prefix, is_first_position, *_device_edge_hashmap);
@@ -337,7 +339,8 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   Idx64Type *unique_frequency = unique_frequency_data;
   Idx64Type *unique_frequency_alternate = unique_frequency_data + num_unique_edges;
   // 2.3 Compact the unique edges and their frequency
-  _compact_frequency<IdxType, Idx64Type, BLOCK_SIZE, TILE_SIZE><<<edges_grid, block, 0, _stream>>>(
+  CUDA_KERNEL_CALL((_compact_frequency<IdxType, Idx64Type, BLOCK_SIZE, TILE_SIZE>),
+      edges_grid, block, 0, _stream,
       src_data, dst_data, num_edges, num_edges_per_node,
       edge_blocks_prefix, is_first_position, num_unique_each_node,
       unique_src_edges, unique_frequency, *_device_edge_hashmap);
@@ -384,7 +387,8 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
   // 4.1 Reset the min(num_pick, num_unique_each_node) to num_unique_each_node
   constexpr int NODE_TILE_SIZE = BLOCK_SIZE * 2;
   const dim3 nodes_grid((num_dst_nodes + NODE_TILE_SIZE - 1) / NODE_TILE_SIZE);
-  _get_pick_num<IdxType, BLOCK_SIZE, NODE_TILE_SIZE><<<nodes_grid, block, 0, _stream>>>(
+  CUDA_KERNEL_CALL((_get_pick_num<IdxType, BLOCK_SIZE, NODE_TILE_SIZE>),
+      nodes_grid, block, 0, _stream,
       num_unique_each_node, num_pick, num_dst_nodes);
   // 4.2 ExclusiveSum the new num_unique_each_node as unique_output_offsets
   // use unique_output_offsets;
@@ -411,7 +415,8 @@ std::tuple<IdArray, IdArray, IdArray> FrequencyHashmap<IdxType>::Topk(
       dtype, _ctx);
   IdArray res_cnt = IdArray::Empty({static_cast<int64_t>(num_output)},
       dtype, _ctx);
-  _pick_data<IdxType, Idx64Type, BLOCK_SIZE, NODE_TILE_SIZE><<<nodes_grid, block, 0, _stream>>>(
+  CUDA_KERNEL_CALL((_pick_data<IdxType, Idx64Type, BLOCK_SIZE, NODE_TILE_SIZE>),
+      nodes_grid, block, 0, _stream,
       d_unique_frequency.Current(), d_unique_src_edges.Current(), num_unique_each_node_alternate,
       dst_data, num_edges_per_node, num_dst_nodes, num_edges,
       unique_output_offsets,
...
@@ -239,11 +239,8 @@ MapEdges(
       const dim3 block(BLOCK_SIZE);
       // map the srcs
-      map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE><<<
-        grid,
-        block,
-        0,
-        stream>>>(
+      CUDA_KERNEL_CALL((map_edge_ids<IdType, BLOCK_SIZE, TILE_SIZE>),
+        grid, block, 0, stream,
         edges.src.Ptr<IdType>(),
         new_lhs.back().Ptr<IdType>(),
         edges.dst.Ptr<IdType>(),
@@ -251,7 +248,6 @@ MapEdges(
         num_edges,
         node_map.LhsHashTable(src_type).DeviceHandle(),
         node_map.RhsHashTable(dst_type).DeviceHandle());
-      CUDA_CALL(cudaGetLastError());
     } else {
       new_lhs.emplace_back(
         aten::NullArray(DLDataType{kDLInt, sizeof(IdType)*8, 1}, ctx));
...
@@ -6,6 +6,7 @@
 #include <cassert>
+#include "cuda_common.h"
 #include "cuda_hashtable.cuh"
 #include "../../array/cuda/atomic.cuh"
 #include "../../array/cuda/dgl_cub.cuh"
@@ -416,21 +417,21 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
   auto device_table = MutableDeviceOrderedHashTable<IdType>(this);
-  generate_hashmap_duplicates<IdType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, stream>>>(
+  CUDA_KERNEL_CALL((generate_hashmap_duplicates<IdType, BLOCK_SIZE, TILE_SIZE>),
+      grid, block, 0, stream,
      input,
      num_input,
      device_table);
-  CUDA_CALL(cudaGetLastError());
   IdType * item_prefix = static_cast<IdType*>(
       device->AllocWorkspace(ctx_, sizeof(IdType)*(num_input+1)));
-  count_hashmap<IdType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, stream>>>(
+  CUDA_KERNEL_CALL((count_hashmap<IdType, BLOCK_SIZE, TILE_SIZE>),
+      grid, block, 0, stream,
      input,
      num_input,
      device_table,
      item_prefix);
-  CUDA_CALL(cudaGetLastError());
   size_t workspace_bytes;
   CUDA_CALL(cub::DeviceScan::ExclusiveSum(
@@ -449,14 +450,14 @@ void OrderedHashTable<IdType>::FillWithDuplicates(
       grid.x+1, stream));
   device->FreeWorkspace(ctx_, workspace);
-  compact_hashmap<IdType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, stream>>>(
+  CUDA_KERNEL_CALL((compact_hashmap<IdType, BLOCK_SIZE, TILE_SIZE>),
+      grid, block, 0, stream,
      input,
      num_input,
      device_table,
      item_prefix,
      unique,
      num_unique);
-  CUDA_CALL(cudaGetLastError());
   device->FreeWorkspace(ctx_, item_prefix);
 }
@@ -473,11 +474,11 @@ void OrderedHashTable<IdType>::FillWithUnique(
   auto device_table = MutableDeviceOrderedHashTable<IdType>(this);
-  generate_hashmap_unique<IdType, BLOCK_SIZE, TILE_SIZE><<<grid, block, 0, stream>>>(
+  CUDA_KERNEL_CALL((generate_hashmap_unique<IdType, BLOCK_SIZE, TILE_SIZE>),
+      grid, block, 0, stream,
      input,
      num_input,
      device_table);
-  CUDA_CALL(cudaGetLastError());
 }
 template class OrderedHashTable<int32_t>;
...
@@ -193,7 +193,8 @@ std::pair<IdArray, NDArray> SparsePush(
     const dim3 block(256);
     const dim3 grid((num_in+block.x-1)/block.x);
-    _DualPermKernel<<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL(_DualPermKernel,
+        grid, block, 0, stream,
        static_cast<const IdType*>(in_idx->data),
        static_cast<const DType*>(in_value->data),
        perm,
@@ -201,7 +202,6 @@ std::pair<IdArray, NDArray> SparsePush(
        num_feat,
        send_idx.get(),
        send_value.get());
-    CUDA_CALL(cudaGetLastError());
   }
   // compute the prefix sum of the send values
@@ -346,13 +346,13 @@ NDArray SparsePull(
     const dim3 block(256);
     const dim3 grid((num_in+block.x-1)/block.x);
-    aten::impl::IndexSelectSingleKernel<<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL(aten::impl::IndexSelectSingleKernel,
+        grid, block, 0, stream,
        static_cast<const IdType*>(req_idx->data),
        perm,
        num_in,
        req_idx->shape[0],
        send_idx.get());
-    CUDA_CALL(cudaGetLastError());
   }
   // compute the prefix sum of the indexes this process is requesting
@@ -453,14 +453,14 @@ NDArray SparsePull(
     }
     const dim3 grid((response_prefix_host.back()+block.y-1)/block.y);
-    aten::impl::IndexSelectMultiKernel<<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL(aten::impl::IndexSelectMultiKernel,
+        grid, block, 0, stream,
        static_cast<const DType*>(local_tensor->data),
        num_feat,
        static_cast<IdType*>(recv_idx->data),
        response_prefix_host.back(),
        local_tensor->shape[0],
        filled_response_value.get());
-    CUDA_CALL(cudaGetLastError());
   }
   // we will collect recieved values in this array
@@ -499,13 +499,13 @@ NDArray SparsePull(
     }
     const dim3 grid((num_in+block.y-1)/block.y);
-    _InversePermKernel<<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL(_InversePermKernel,
+        grid, block, 0, stream,
        filled_request_value.get(),
        num_feat,
        num_in,
        perm,
        static_cast<DType*>(result->data));
-    CUDA_CALL(cudaGetLastError());
   }
   return result;
...