Unverified commit 2c04ecb5, authored by Zihao Ye, committed by GitHub

[hotfix] Skip CUDA kernel launch when number of blocks/threads is zero. (#2144)



* upd

* upd

* upd

* upd

* lint

* upd

* upd

* fmt
Co-authored-by: Quan (Andy) Gan <coin2028@hotmail.com>
parent 5f44a4ef
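Why the launch must be skipped: every call site in this diff computes the grid size by ceiling division, nb = (len + nt - 1) / nt, so an empty input (len == 0) yields nb == 0, and CUDA rejects any launch whose grid or block dimension is zero with cudaErrorInvalidConfiguration. A minimal standalone sketch of the failure mode and of the guard this commit adds (_DummyKernel and the sizes are hypothetical, not DGL code):

// Editor's sketch, not part of this commit: launching with a zero-sized
// grid fails; guarding the launch (what CUDA_KERNEL_CALL below does) does not.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void _DummyKernel(int* out, int64_t len) {
  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (tx < len) out[tx] = 0;
}

int main() {
  const int64_t len = 0;               // empty array, e.g. a graph with no edges
  const int nt = 256;                  // thread count is always positive
  const int nb = (len + nt - 1) / nt;  // ceiling division; 0 blocks when len == 0

  _DummyKernel<<<nb, nt>>>(nullptr, len);
  printf("unguarded: %s\n", cudaGetErrorString(cudaGetLastError()));
  // prints "invalid configuration argument" (cudaErrorInvalidConfiguration)

  if (nb != 0 && nt != 0) {            // the guard, in macro form below
    _DummyKernel<<<nb, nt>>>(nullptr, len);
  }
  printf("guarded: %s\n", cudaGetErrorString(cudaGetLastError()));
  // prints "no error": the launch was skipped, so there is nothing to report
  return 0;
}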
@@ -36,7 +36,8 @@ NDArray IndexSelect(NDArray array, IdArray index) {
DType* ret_data = static_cast<DType*>(ret->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_IndexSelectKernel<<<nb, nt, 0, thr_entry->stream>>>(array_data, idx_data, len, ret_data);
CUDA_KERNEL_CALL(_IndexSelectKernel, nb, nt, 0, thr_entry->stream,
array_data, idx_data, len, ret_data);
return ret;
}
......
@@ -36,7 +36,8 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
nb, nt, 0, thr_entry->stream,
lhs_data, rhs_data, ret_data, len);
return ret;
}
@@ -85,7 +86,8 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
nb, nt, 0, thr_entry->stream,
lhs_data, rhs, ret_data, len);
return ret;
}
@@ -135,7 +137,8 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
nb, nt, 0, thr_entry->stream,
lhs, rhs_data, ret_data, len);
return ret;
}
@@ -183,7 +186,8 @@ IdArray UnaryElewise(IdArray lhs) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_UnaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_UnaryElewiseKernel<IdType, Op>),
nb, nt, 0, thr_entry->stream,
lhs_data, ret_data, len);
return ret;
}
@@ -211,7 +215,8 @@ IdArray Full(IdType val, int64_t length, DLContext ctx) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
_FullKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, length, val);
CUDA_KERNEL_CALL((_FullKernel<IdType>), nb, nt, 0, thr_entry->stream,
ret_data, length, val);
return ret;
}
@@ -242,7 +247,9 @@ IdArray Range(IdType low, IdType high, DLContext ctx) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
_RangeKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, low, length);
CUDA_KERNEL_CALL((_RangeKernel<IdType>),
nb, nt, 0, thr_entry->stream,
ret_data, low, length);
return ret;
}
@@ -270,10 +277,12 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) {
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
if (bits == 32) {
_CastKernel<IdType, int32_t><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_CastKernel<IdType, int32_t>),
nb, nt, 0, thr_entry->stream,
static_cast<IdType*>(arr->data), static_cast<int32_t*>(ret->data), length);
} else {
_CastKernel<IdType, int64_t><<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((_CastKernel<IdType, int64_t>),
nb, nt, 0, thr_entry->stream,
static_cast<IdType*>(arr->data), static_cast<int64_t*>(ret->data), length);
}
return ret;
......
@@ -33,7 +33,8 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_ScatterKernel<<<nb, nt, 0, thr_entry->stream>>>(idx, val, len, outd);
CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, thr_entry->stream,
idx, val, len, outd);
}
template void Scatter_<kDLGPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
......
@@ -131,7 +131,8 @@ CSRMatrix COOToCSR<kDLGPU, int64_t>(COOMatrix coo) {
const int nt = cuda::FindNumThreads(coo.num_rows);
const int nb = (coo.num_rows + nt - 1) / nt;
IdArray indptr = Full(0, coo.num_rows + 1, nbits, ctx);
_SortedSearchKernelUpperBound<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SortedSearchKernelUpperBound,
nb, nt, 0, thr_entry->stream,
coo.row.Ptr<int64_t>(), nnz,
rowids.Ptr<int64_t>(), coo.num_rows,
indptr.Ptr<int64_t>() + 1);
......
@@ -155,7 +155,7 @@ std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
const int nt = cuda::FindNumThreads(nnz);
const int nb = (nnz + nt - 1) / nt;
_COOIsSortedKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_COOIsSortedKernel, nb, nt, 0, thr_entry->stream,
coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
nnz, row_flags, col_flags);
......
@@ -91,7 +91,8 @@ COOMatrix CSRToCOO<kDLGPU, int64_t>(CSRMatrix csr) {
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
_RepeatKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_RepeatKernel,
nb, nt, 0, thr_entry->stream,
rowids.Ptr<int64_t>(), row_nnz.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
csr.num_rows);
......
@@ -44,7 +44,8 @@ bool CSRIsSorted(CSRMatrix csr) {
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
_SegmentIsSorted<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentIsSorted,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
......
@@ -182,14 +182,13 @@ void SDDMMCoo(
const bool use_idx = !IsNullArray(coo.data);
BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, {
SDDMMCooKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>
<<<nblks, nthrs, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((SDDMMCooKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>),
nblks, nthrs, 0, thr_entry->stream,
lhs_data, rhs_data, out_data,
row, col, edge_map,
coo.num_rows, coo.num_cols, nnz, reduce_dim,
lhs_off, rhs_off,
lhs_len, rhs_len, len
);
lhs_len, rhs_len, len);
});
}
@@ -233,14 +232,13 @@ void SDDMMCsr(
const bool use_idx = !IsNullArray(csr.data);
BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, {
SDDMMCsrKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>
<<<nblks, nthrs, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((SDDMMCsrKernel<Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>),
nblks, nthrs, 0, thr_entry->stream,
lhs_data, rhs_data, out_data,
indptr, indices, edge_map,
N, M, E, reduce_dim,
lhs_off, rhs_off,
lhs_len, rhs_len, len
);
lhs_len, rhs_len, len);
});
}
......
@@ -61,7 +61,8 @@ bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
IdArray out = aten::NewIdArray(1, ctx, sizeof(IdType) * 8);
const IdType* data = nullptr;
// TODO(minjie): use binary search for sorted csr
_LinearSearchKernel<<<1, 1, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_LinearSearchKernel,
1, 1, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), data,
rows.Ptr<IdType>(), cols.Ptr<IdType>(),
1, 1, 1,
@@ -88,7 +89,8 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const int nb = (rstlen + nt - 1) / nt;
const IdType* data = nullptr;
// TODO(minjie): use binary search for sorted csr
_LinearSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_LinearSearchKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), data,
row.Ptr<IdType>(), col.Ptr<IdType>(),
row_stride, col_stride, rstlen,
@@ -134,7 +136,8 @@ bool CSRHasDuplicate(CSRMatrix csr) {
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
_SegmentHasNoDuplicate<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentHasNoDuplicate,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
@@ -182,7 +185,8 @@ NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
IdType* rst_data = static_cast<IdType*>(rst->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_CSRGetRowNNZKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_CSRGetRowNNZKernel,
nb, nt, 0, thr_entry->stream,
vid_data, indptr_data, rst_data, len);
return rst;
}
@@ -281,13 +285,15 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
const int nb = (len + nt - 1) / nt;
// Copy indices.
IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
_SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentCopyKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_indices.Ptr<IdType>());
// Copy data.
IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
_SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentCopyKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_data.Ptr<IdType>());
@@ -321,7 +327,8 @@ IdArray CSRGetData(CSRMatrix csr, NDArray row, NDArray col) {
const int nt = cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
// TODO(minjie): use binary search for sorted csr
_LinearSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_LinearSearchKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
row.Ptr<IdType>(), col.Ptr<IdType>(),
@@ -422,7 +429,8 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
IdArray mask = Full(0, nnz, nbits, ctx);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_SegmentMaskKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentMaskKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
row.Ptr<IdType>(), col.Ptr<IdType>(),
row_stride, col_stride, len,
@@ -437,7 +445,8 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits);
const int nt2 = cuda::FindNumThreads(idx->shape[0]);
const int nb2 = (idx->shape[0] + nt2 - 1) / nt2;
_SortedSearchKernel<<<nb2, nt2, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SortedSearchKernel,
nb2, nt2, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.num_rows,
idx.Ptr<IdType>(), idx->shape[0],
ret_row.Ptr<IdType>());
@@ -512,7 +521,8 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
IdArray count = NewIdArray(csr.num_rows, ctx, nbits);
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
_SegmentMaskColKernel<<<nb, nt, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(_SegmentMaskColKernel,
nb, nt, 0, thr_entry->stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), csr.num_rows,
cols.Ptr<IdType>(), cols->shape[0],
mask.Ptr<IdType>(), count.Ptr<IdType>());
......
@@ -21,7 +21,7 @@ void _Fill(DType* ptr, size_t length, DType val) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = FindNumThreads(length);
int nb = (length + nt - 1) / nt; // on x-axis, no need to worry about upperbound.
cuda::_FillKernel<<<nb, nt, 0, thr_entry->stream>>>(ptr, length, val);
CUDA_KERNEL_CALL(cuda::_FillKernel, nb, nt, 0, thr_entry->stream, ptr, length, val);
}
} // namespace
......
@@ -224,7 +224,8 @@ void SpMMCoo(
int64_t out_size = out.NumElements();
const int nt = FindNumThreads(out_size);
const int nb = (out_size + nt - 1) / nt;
_FillKernel<<<nb, nt, 0, thr_entry->stream>>>(out_data, out_size, ReduceOp::zero);
CUDA_KERNEL_CALL(_FillKernel, nb, nt, 0, thr_entry->stream,
out_data, out_size, ReduceOp::zero);
const int ntx = FindNumThreads(len);
const int nty = CUDA_MAX_NUM_THREADS / ntx;
@@ -236,23 +237,21 @@
const bool use_idx = !IsNullArray(coo.data);
BCAST_IDX_CTX_SWITCH(bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, {
SpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
<<<nblks, nthrs, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((SpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
row, col, edge_map,
N, M, E,
ubcast_off, ebcast_off,
lhs_len, rhs_len, len
);
lhs_len, rhs_len, len);
if (ReduceOp::require_arg) {
ArgSpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
<<<nblks, nthrs, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((ArgSpMMCooKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
row, col, edge_map,
N, M, E,
ubcast_off, ebcast_off,
lhs_len, rhs_len, len
);
lhs_len, rhs_len, len);
}
});
}
@@ -303,14 +302,13 @@ void SpMMCsr(
const bool use_idx = !IsNullArray(csr.data);
BCAST_IDX_CTX_SWITCH(bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, {
SpMMCsrKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>
<<<nblks, nthrs, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL((SpMMCsrKernel<Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>),
nblks, nthrs, 0, thr_entry->stream,
ufeat_data, efeat_data, out_data, argu_data, arge_data,
indptr, indices, edge_map,
csr.num_rows, csr.num_cols, efeat->shape[0],
ubcast_off, ebcast_off,
lhs_len, rhs_len, len
);
lhs_len, rhs_len, len);
});
}
......
@@ -108,7 +108,8 @@ void FarthestPointSampler(NDArray array, int64_t batch_size, int64_t sample_poin
// sample for each cloud in the batch
IdType* start_idx_data = static_cast<IdType*>(start_idx->data);
fps_kernel<<<batch_size, THREADS, 0, thr_entry->stream>>>(
CUDA_KERNEL_CALL(fps_kernel,
batch_size, THREADS, 0, thr_entry->stream,
array_data, batch_size, sample_points,
point_in_batch, dim, start_idx_data, dist_data, ret_data);
}
......
@@ -25,7 +25,7 @@ void Fill(const DLContext& ctx, DType* ptr, size_t length, DType val) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = utils::FindNumThreads(length, 1024);
int nb = (length + nt - 1) / nt;
_FillKernel<<<nb, nt, 0, thr_entry->stream>>>(ptr, length, val);
CUDA_KERNEL_CALL(_FillKernel, nb, nt, 0, thr_entry->stream, ptr, length, val);
}
template void Fill<kDLGPU, float>(const DLContext& ctx, float* ptr, size_t length, float val);
......
@@ -16,6 +16,16 @@
namespace dgl {
namespace runtime {
template <typename T>
inline bool is_zero(T size) {
return size == 0;
}
template <>
inline bool is_zero<dim3>(dim3 size) {
return size.x == 0 || size.y == 0 || size.z == 0;
}
#define CUDA_DRIVER_CALL(x) \
{ \
CUresult result = x; \
@@ -34,6 +44,19 @@ namespace runtime {
<< "CUDA: " << cudaGetErrorString(e); \
}
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \
{ \
if (!dgl::runtime::is_zero((nblks)) && \
!dgl::runtime::is_zero((nthrs))) { \
(kernel) <<< (nblks), (nthrs), (shmem), (stream) >>> \
(__VA_ARGS__); \
cudaError_t e = cudaGetLastError(); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA kernel launch error: " \
<< cudaGetErrorString(e); \
} \
}
#define CUSPARSE_CALL(func) \
{ \
cusparseStatus_t e = (func); \
......
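End-to-end behavior of the new macro, as a self-contained sketch mirroring the is_zero helpers and CUDA_KERNEL_CALL from the diff above. Simplifications here are assumptions, not DGL code: no dgl::runtime namespace, CHECK replaced by a plain printf, the default stream (0) instead of thr_entry->stream, and a hypothetical _TestFillKernel:

#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
inline bool is_zero(T size) { return size == 0; }

// dim3 launch configurations: skipped if any of x, y, or z is zero.
template <>
inline bool is_zero<dim3>(dim3 size) {
  return size.x == 0 || size.y == 0 || size.z == 0;
}

#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...)        \
  {                                                                       \
    if (!is_zero((nblks)) && !is_zero((nthrs))) {                         \
      (kernel) <<< (nblks), (nthrs), (shmem), (stream) >>> (__VA_ARGS__); \
      cudaError_t e = cudaGetLastError();                                 \
      if (e != cudaSuccess)                                               \
        printf("CUDA kernel launch error: %s\n", cudaGetErrorString(e));  \
    }                                                                     \
  }

__global__ void _TestFillKernel(int* ptr, int64_t length, int val) {
  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (tx < length) ptr[tx] = val;
}

int main() {
  const int nt = 256;

  // Empty input: nb == 0, so the macro silently skips the launch.
  int64_t len = 0;
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL(_TestFillKernel, nb, nt, 0, 0, nullptr, len, 1);

  // Non-empty input: launches as before.
  len = 1000;
  nb = (len + nt - 1) / nt;  // 4 blocks
  int* data;
  cudaMalloc(&data, len * sizeof(int));
  CUDA_KERNEL_CALL(_TestFillKernel, nb, nt, 0, 0, data, len, 1);

  // dim3 case: the is_zero<dim3> specialization skips this launch too.
  CUDA_KERNEL_CALL(_TestFillKernel, dim3(0, 1, 1), dim3(32, 1, 1), 0, 0,
                   data, len, 2);

  cudaDeviceSynchronize();
  cudaFree(data);
  return 0;
}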