Unverified commit b586cc2f, authored by Jiazhen Wang and committed by GitHub
Browse files

[Refactor] Replace DIVUP with GET_BLOCKS (#1586)

* [Improve] Migrate DIVUP to GET_BLOCKS

* [Fix] Use GET_BLOCKS only for block allocation and delete useless statements

* [Fix] Add a kernel loop for NMS and delete useless statements
parent cf754db9
...@@ -19,7 +19,7 @@ void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints, ...@@ -19,7 +19,7 @@ void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
...@@ -46,7 +46,7 @@ void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints, ...@@ -46,7 +46,7 @@ void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
...@@ -21,8 +21,8 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a, ...@@ -21,8 +21,8 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK_IOU3D), dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),
DIVUP(num_a, THREADS_PER_BLOCK_IOU3D)); GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));
dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D); dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);
iou3d_boxes_overlap_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>( iou3d_boxes_overlap_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
...@@ -41,8 +41,8 @@ void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a, ...@@ -41,8 +41,8 @@ void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK_IOU3D), dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),
DIVUP(num_a, THREADS_PER_BLOCK_IOU3D)); GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));
dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D); dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);
iou3d_boxes_iou_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>( iou3d_boxes_iou_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
...@@ -58,8 +58,8 @@ void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes, ...@@ -58,8 +58,8 @@ void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
at::cuda::CUDAGuard device_guard(boxes.device()); at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS); dim3 threads(THREADS_PER_BLOCK_NMS);
nms_forward_cuda_kernel<<<blocks, threads, 0, stream>>>( nms_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
...@@ -75,8 +75,8 @@ void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes, ...@@ -75,8 +75,8 @@ void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
at::cuda::CUDAGuard device_guard(boxes.device()); at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
dim3 threads(THREADS_PER_BLOCK_NMS); dim3 threads(THREADS_PER_BLOCK_NMS);
nms_normal_forward_cuda_kernel<<<blocks, threads, 0, stream>>>( nms_normal_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
......
...@@ -19,7 +19,7 @@ void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample, ...@@ -19,7 +19,7 @@ void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
...@@ -13,10 +13,11 @@ Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, ...@@ -13,10 +13,11 @@ Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
auto boxes_sorted = boxes.index_select(0, order_t); auto boxes_sorted = boxes.index_select(0, order_t);
int boxes_num = boxes.size(0); int boxes_num = boxes.size(0);
const int col_blocks = DIVUP(boxes_num, threadsPerBlock); const int col_blocks = (boxes_num + threadsPerBlock - 1) / threadsPerBlock;
const int col_blocks_alloc = GET_BLOCKS(boxes_num, threadsPerBlock);
Tensor mask = Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong)); at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
dim3 blocks(col_blocks, col_blocks); dim3 blocks(col_blocks_alloc, col_blocks_alloc);
dim3 threads(threadsPerBlock); dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
nms_cuda<<<blocks, threads, 0, stream>>>( nms_cuda<<<blocks, threads, 0, stream>>>(
......
...@@ -21,7 +21,7 @@ void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num, ...@@ -21,7 +21,7 @@ void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
at::cuda::CUDAGuard device_guard(boxes.device()); at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size); dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
...@@ -47,7 +47,7 @@ void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num, ...@@ -47,7 +47,7 @@ void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
at::cuda::CUDAGuard device_guard(boxes.device()); at::cuda::CUDAGuard device_guard(boxes.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size); dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
...@@ -26,7 +26,7 @@ void RoiawarePool3dForwardCUDAKernelLauncher( ...@@ -26,7 +26,7 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
Tensor pts_mask = Tensor pts_mask =
-at::ones({boxes_num, pts_num}, pts_feature.options().dtype(at::kInt)); -at::ones({boxes_num, pts_num}, pts_feature.options().dtype(at::kInt));
dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num); dim3 blocks_mask(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
...@@ -42,7 +42,7 @@ void RoiawarePool3dForwardCUDAKernelLauncher( ...@@ -42,7 +42,7 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
// TODO: Merge the collect and pool functions, SS // TODO: Merge the collect and pool functions, SS
dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK)); dim3 blocks_collect(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK));
AT_DISPATCH_INTEGRAL_TYPES( AT_DISPATCH_INTEGRAL_TYPES(
pts_idx_of_voxels.scalar_type(), "collect_inside_pts_for_box3d", [&] { pts_idx_of_voxels.scalar_type(), "collect_inside_pts_for_box3d", [&] {
...@@ -55,8 +55,8 @@ void RoiawarePool3dForwardCUDAKernelLauncher( ...@@ -55,8 +55,8 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
AT_CUDA_CHECK(cudaGetLastError()); AT_CUDA_CHECK(cudaGetLastError());
dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, dim3 blocks_pool(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK),
boxes_num); channels, boxes_num);
if (pool_method == 0) { if (pool_method == 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
pts_feature.scalar_type(), "roiaware_maxpool3d", [&] { pts_feature.scalar_type(), "roiaware_maxpool3d", [&] {
...@@ -93,7 +93,7 @@ void RoiawarePool3dBackwardCUDAKernelLauncher( ...@@ -93,7 +93,7 @@ void RoiawarePool3dBackwardCUDAKernelLauncher(
at::cuda::CUDAGuard device_guard(grad_out.device()); at::cuda::CUDAGuard device_guard(grad_out.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, dim3 blocks(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
boxes_num); boxes_num);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
......
...@@ -24,7 +24,7 @@ void RoIPointPool3dForwardCUDAKernelLauncher( ...@@ -24,7 +24,7 @@ void RoIPointPool3dForwardCUDAKernelLauncher(
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size); dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
...@@ -38,14 +38,14 @@ void RoIPointPool3dForwardCUDAKernelLauncher( ...@@ -38,14 +38,14 @@ void RoIPointPool3dForwardCUDAKernelLauncher(
boxes3d.options().dtype(at::kInt)); boxes3d.options().dtype(at::kInt));
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size); dim3 blocks2(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK), batch_size);
get_pooled_idx<<<blocks2, threads, 0, stream>>>( get_pooled_idx<<<blocks2, threads, 0, stream>>>(
batch_size, pts_num, boxes_num, sampled_pts_num, batch_size, pts_num, boxes_num, sampled_pts_num,
pts_assign.data_ptr<int>(), pts_idx.data_ptr<int>(), pts_assign.data_ptr<int>(), pts_idx.data_ptr<int>(),
pooled_empty_flag.data_ptr<int>()); pooled_empty_flag.data_ptr<int>());
dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num, dim3 blocks_pool(GET_BLOCKS(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
batch_size); batch_size);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
...@@ -23,7 +23,7 @@ void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n, ...@@ -23,7 +23,7 @@ void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
...@@ -51,7 +51,7 @@ void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m, ...@@ -51,7 +51,7 @@ void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
...@@ -21,7 +21,7 @@ void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown, ...@@ -21,7 +21,7 @@ void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row) // blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK); dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
...@@ -73,7 +73,8 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num, ...@@ -73,7 +73,8 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>(); int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>(); int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask = Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong)); at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
...@@ -117,7 +118,8 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num, ...@@ -117,7 +118,8 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>(); int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>(); int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask = Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong)); at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
......
...@@ -85,7 +85,7 @@ void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output, ...@@ -85,7 +85,7 @@ void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
case 0: case 0:
case 1: case 1:
nthreads = batch_size * channels * width; nthreads = batch_size * channels * width;
col_block = DIVUP(nthreads, THREADS_PER_BLOCK); col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
top_bottom_pool_kernel<scalar_t> top_bottom_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>( <<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type); input, output, batch_size, channels, height, width, pool_type);
...@@ -93,7 +93,7 @@ void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output, ...@@ -93,7 +93,7 @@ void CornerPoolForwardLauncher(const scalar_t *input, scalar_t *output,
case 2: case 2:
case 3: case 3:
nthreads = batch_size * channels * height; nthreads = batch_size * channels * height;
col_block = DIVUP(nthreads, THREADS_PER_BLOCK); col_block = GET_BLOCKS(nthreads, THREADS_PER_BLOCK);
left_right_pool_kernel<scalar_t> left_right_pool_kernel<scalar_t>
<<<col_block, THREADS_PER_BLOCK, 0, stream>>>( <<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output, batch_size, channels, height, width, pool_type); input, output, batch_size, channels, height, width, pool_type);
......
...@@ -67,7 +67,7 @@ void CumMaxMinForwardLauncher(const scalar_t *input, scalar_t *output_value, ...@@ -67,7 +67,7 @@ void CumMaxMinForwardLauncher(const scalar_t *input, scalar_t *output_value,
const int data_size = const int data_size =
tensor_desc.stride[0] * tensor_desc.shape[0] / tensor_desc.shape[cum_dim]; tensor_desc.stride[0] * tensor_desc.shape[0] / tensor_desc.shape[cum_dim];
const int col_block = DIVUP(data_size, THREADS_PER_BLOCK); const int col_block = GET_BLOCKS(data_size, THREADS_PER_BLOCK);
cummaxmin_kernel<scalar_t><<<col_block, THREADS_PER_BLOCK, 0, stream>>>( cummaxmin_kernel<scalar_t><<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
input, output_value, output_index, tensor_desc, cum_dim, cum_type); input, output_value, output_index, tensor_desc, cum_dim, cum_type);
......
...@@ -114,7 +114,8 @@ size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension, ...@@ -114,7 +114,8 @@ size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension,
mmcv::getAlignedSize(spatial_dimension * boxes_word_size); mmcv::getAlignedSize(spatial_dimension * boxes_word_size);
size_t boxes_workspace = size_t boxes_workspace =
mmcv::getAlignedSize(spatial_dimension * 4 * boxes_word_size); mmcv::getAlignedSize(spatial_dimension * 4 * boxes_word_size);
const int col_blocks = DIVUP(spatial_dimension, threadsPerBlock); const int col_blocks =
(spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
size_t mask_workspace = mmcv::getAlignedSize(spatial_dimension * col_blocks * size_t mask_workspace = mmcv::getAlignedSize(spatial_dimension * col_blocks *
sizeof(unsigned long long)); sizeof(unsigned long long));
size_t index_template_workspace = size_t index_template_workspace =
...@@ -163,7 +164,8 @@ void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores, ...@@ -163,7 +164,8 @@ void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
int spatial_dimension, int num_classes, int spatial_dimension, int num_classes,
size_t output_length, void* workspace, size_t output_length, void* workspace,
cudaStream_t stream) { cudaStream_t stream) {
const int col_blocks = DIVUP(spatial_dimension, threadsPerBlock); const int col_blocks =
(spatial_dimension + threadsPerBlock - 1) / threadsPerBlock;
float* boxes_sorted = (float*)workspace; float* boxes_sorted = (float*)workspace;
workspace = static_cast<char*>(workspace) + workspace = static_cast<char*>(workspace) +
mmcv::getAlignedSize(spatial_dimension * 4 * sizeof(float)); mmcv::getAlignedSize(spatial_dimension * 4 * sizeof(float));
......
...@@ -67,7 +67,7 @@ void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices, ...@@ -67,7 +67,7 @@ void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices,
num_update_indice *= indice_desc.shape[i]; num_update_indice *= indice_desc.shape[i];
} }
// scatter // scatter
const int col_block = DIVUP(num_update_indice, threadsPerBlock); const int col_block = GET_BLOCKS(num_update_indice, threadsPerBlock);
onnx_scatternd_kernel<<<col_block, threadsPerBlock, 0, stream>>>( onnx_scatternd_kernel<<<col_block, threadsPerBlock, 0, stream>>>(
num_update_indice, indices, update, output, tensor_desc, indice_desc); num_update_indice, indices, update, output, tensor_desc, indice_desc);
} }
......
...@@ -3,8 +3,6 @@ ...@@ -3,8 +3,6 @@
#define TRT_CUDA_HELPER_HPP #define TRT_CUDA_HELPER_HPP
#include <cublas_v2.h> #include <cublas_v2.h>
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define cudaCheckError() \ #define cudaCheckError() \
{ \ { \
cudaError_t e = cudaGetLastError(); \ cudaError_t e = cudaGetLastError(); \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment