OpenDAS / MMCV · Commits

Commit fdeee889, authored May 25, 2025 by limm

    release v1.6.1 of mmcv

Parent: df465820
Changes: 457 files
Showing 20 changed files with 1010 additions and 106 deletions (+1010 −106)
| File | Additions | Deletions |
| --- | --- | --- |
| mmcv/ops/csrc/pytorch/cuda/gather_points_cuda.cu | +2 | −2 |
| mmcv/ops/csrc/pytorch/cuda/group_points_cuda.cu | +2 | −2 |
| mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu | +16 | −35 |
| mmcv/ops/csrc/pytorch/cuda/knn_cuda.cu | +1 | −1 |
| mmcv/ops/csrc/pytorch/cuda/min_area_polygons.cu | +21 | −0 |
| mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.cu | +4 | −14 |
| mmcv/ops/csrc/pytorch/cuda/nms_cuda.cu | +11 | −28 |
| mmcv/ops/csrc/pytorch/cuda/points_in_boxes_cuda.cu | +2 | −2 |
| mmcv/ops/csrc/pytorch/cuda/points_in_polygons_cuda.cu | +28 | −0 |
| mmcv/ops/csrc/pytorch/cuda/prroi_pool_cuda.cu | +65 | −0 |
| mmcv/ops/csrc/pytorch/cuda/psamask_cuda.cu | +0 | −3 |
| mmcv/ops/csrc/pytorch/cuda/riroi_align_rotated_cuda.cu | +53 | −0 |
| mmcv/ops/csrc/pytorch/cuda/roi_align_rotated_cuda.cu | +7 | −7 |
| mmcv/ops/csrc/pytorch/cuda/roiaware_pool3d_cuda.cu | +5 | −5 |
| mmcv/ops/csrc/pytorch/cuda/roipoint_pool3d_cuda.cu | +3 | −3 |
| mmcv/ops/csrc/pytorch/cuda/rotated_feature_align_cuda.cu | +53 | −0 |
| mmcv/ops/csrc/pytorch/cuda/scatter_points_cuda.cu | +9 | −4 |
| mmcv/ops/csrc/pytorch/cuda/sparse_indice.cu | +156 | −0 |
| mmcv/ops/csrc/pytorch/cuda/sparse_maxpool.cu | +483 | −0 |
| mmcv/ops/csrc/pytorch/cuda/sparse_pool_ops_cuda.cu | +89 | −0 |
Too many changes to show: only 457 of the 457+ changed files are displayed; the diffs below cover the 20 files listed on this page.
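The recurring change across these files is replacing the `DIVUP` macro with `GET_BLOCKS` when computing grid dimensions. The snippet below is a minimal sketch of what these two helpers typically look like in mmcv's CUDA helper headers; the exact definitions live in `common_cuda_helper.hpp` and are not part of this diff, so treat the constants here (512 threads, 4096-block cap) as assumptions. The point of the change: `DIVUP` is plain ceiling division and can request arbitrarily many blocks, while `GET_BLOCKS` clamps the block count, which typically pairs with grid-stride loops inside the kernels.

```cpp
// Sketch only — not copied from this commit. THREADS_PER_BLOCK and the
// 4096 cap are assumptions based on the usual mmcv helper definitions.
#define THREADS_PER_BLOCK 512

// Plain ceiling division: can produce an arbitrarily large grid.
#define DIVUP(m, n) (((m) + (n)-1) / (n))

// Ceiling division clamped to a maximum block count, so oversized inputs
// are handled by having each block process several chunks of work.
inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
  int optimal_block_num = (N + num_threads - 1) / num_threads;
  int max_block_num = 4096;
  return optimal_block_num < max_block_num ? optimal_block_num : max_block_num;
}
```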
mmcv/ops/csrc/pytorch/cuda/gather_points_cuda.cu

```diff
@@ -16,7 +16,7 @@ void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);
+  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);
   dim3 threads(THREADS_PER_BLOCK);
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -43,7 +43,7 @@ void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b);
+  dim3 blocks(GET_BLOCKS(npoints, THREADS_PER_BLOCK), c, b);
   dim3 threads(THREADS_PER_BLOCK);
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
```
mmcv/ops/csrc/pytorch/cuda/group_points_cuda.cu

```diff
@@ -19,7 +19,7 @@ void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
+  dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
   dim3 threads(THREADS_PER_BLOCK);
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -46,7 +46,7 @@ void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b);
+  dim3 blocks(GET_BLOCKS(npoints * nsample, THREADS_PER_BLOCK), c, b);
   dim3 threads(THREADS_PER_BLOCK);
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
```
mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu

```diff
@@ -21,8 +21,8 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK_IOU3D),
-              DIVUP(num_a, THREADS_PER_BLOCK_IOU3D));
+  dim3 blocks(GET_BLOCKS(num_b, THREADS_PER_BLOCK_IOU3D),
+              GET_BLOCKS(num_a, THREADS_PER_BLOCK_IOU3D));
   dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);
   iou3d_boxes_overlap_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
@@ -32,54 +32,35 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
   AT_CUDA_CHECK(cudaGetLastError());
 }

-void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
-                                               const Tensor boxes_a,
-                                               const int num_b,
-                                               const Tensor boxes_b,
-                                               Tensor ans_iou) {
-  at::cuda::CUDAGuard device_guard(boxes_a.device());
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK_IOU3D),
-              DIVUP(num_a, THREADS_PER_BLOCK_IOU3D));
-  dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D);
-  iou3d_boxes_iou_bev_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
-      num_a, boxes_a.data_ptr<float>(), num_b, boxes_b.data_ptr<float>(),
-      ans_iou.data_ptr<float>());
-  AT_CUDA_CHECK(cudaGetLastError());
-}
-
-void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
-                                       unsigned long long *mask, int boxes_num,
-                                       float nms_overlap_thresh) {
+void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes,
+                                         unsigned long long *mask,
+                                         int boxes_num,
+                                         float nms_overlap_thresh) {
   at::cuda::CUDAGuard device_guard(boxes.device());
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
-              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
+              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
   dim3 threads(THREADS_PER_BLOCK_NMS);
-  nms_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
+  iou3d_nms3d_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
       boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(), mask);
   AT_CUDA_CHECK(cudaGetLastError());
 }

-void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
-                                             unsigned long long *mask,
-                                             int boxes_num,
-                                             float nms_overlap_thresh) {
+void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes,
+                                               unsigned long long *mask,
+                                               int boxes_num,
+                                               float nms_overlap_thresh) {
   at::cuda::CUDAGuard device_guard(boxes.device());
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
-              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+  dim3 blocks(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS),
+              GET_BLOCKS(boxes_num, THREADS_PER_BLOCK_NMS));
   dim3 threads(THREADS_PER_BLOCK_NMS);
-  nms_normal_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
+  iou3d_nms3d_normal_forward_cuda_kernel<<<blocks, threads, 0, stream>>>(
       boxes_num, nms_overlap_thresh, boxes.data_ptr<float>(), mask);
   AT_CUDA_CHECK(cudaGetLastError());
```
mmcv/ops/csrc/pytorch/cuda/knn_cuda.cu

```diff
@@ -19,7 +19,7 @@ void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);
+  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
   dim3 threads(THREADS_PER_BLOCK);
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
```
mmcv/ops/csrc/pytorch/cuda/min_area_polygons.cu (new file, mode 100644)

```cpp
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/minareabbox/src/minareabbox_kernel.cu
#include "min_area_polygons_cuda.cuh"
#include "pytorch_cuda_helper.hpp"

void MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets,
                                       Tensor polygons) {
  int num_pointsets = pointsets.size(0);
  const int output_size = polygons.numel();
  at::cuda::CUDAGuard device_guard(pointsets.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      pointsets.scalar_type(), "min_area_polygons_cuda_kernel", ([&] {
        min_area_polygons_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                num_pointsets, pointsets.data_ptr<scalar_t>(),
                polygons.data_ptr<scalar_t>());
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}
```
mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.cu

```diff
@@ -31,7 +31,7 @@ void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t *data_value,
     const int num_point, scalar_t *data_col) {
   const int num_kernels = batch_size * num_query * num_heads * channels;
   const int num_actual_kernels = batch_size * num_query * num_heads * channels;
-  const int num_threads = CUDA_NUM_THREADS;
+  const int num_threads = THREADS_PER_BLOCK;
   ms_deformable_im2col_gpu_kernel<scalar_t>
       <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
           num_kernels, data_value, data_spatial_shapes, data_level_start_index,
@@ -54,11 +54,11 @@ void ms_deformable_col2im_cuda(
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
   const int num_threads =
-      (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels;
+      (channels > THREADS_PER_BLOCK) ? THREADS_PER_BLOCK : channels;
   const int num_kernels = batch_size * num_query * num_heads * channels;
   const int num_actual_kernels = batch_size * num_query * num_heads * channels;
-  if (channels > 1024) {
-    if ((channels & 1023) == 0) {
+  if (channels > THREADS_PER_BLOCK) {
+    if ((channels & THREADS_PER_BLOCK - 1) == 0) {
       ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
           <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
              num_threads * 3 * sizeof(scalar_t), stream>>>(
@@ -178,16 +178,6 @@ void ms_deformable_col2im_cuda(
               channels, num_levels, num_query, num_point, grad_value,
               grad_sampling_loc, grad_attn_weight);
           break;
-        case 1024:
-          ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<
-              scalar_t, 1024>
-              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-                 stream>>>(
-                  num_kernels, grad_col, data_value, data_spatial_shapes,
-                  data_level_start_index, data_sampling_loc, data_attn_weight,
-                  batch_size, spatial_size, num_heads, channels, num_levels,
-                  num_query, num_point, grad_value, grad_sampling_loc,
-                  grad_attn_weight);
-          break;
         default:
           if (channels < 64) {
             ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
```
mmcv/ops/csrc/pytorch/cuda/nms_cuda.cu

```diff
@@ -13,41 +13,24 @@ Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
   auto boxes_sorted = boxes.index_select(0, order_t);

   int boxes_num = boxes.size(0);
-  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
+  const int col_blocks = (boxes_num + threadsPerBlock - 1) / threadsPerBlock;
+  const int col_blocks_alloc = GET_BLOCKS(boxes_num, threadsPerBlock);
   Tensor mask =
       at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
-  dim3 blocks(col_blocks, col_blocks);
+  dim3 blocks(col_blocks_alloc, col_blocks_alloc);
   dim3 threads(threadsPerBlock);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   nms_cuda<<<blocks, threads, 0, stream>>>(
       boxes_num, iou_threshold, offset, boxes_sorted.data_ptr<float>(),
       (unsigned long long*)mask.data_ptr<int64_t>());
-  at::Tensor mask_cpu = mask.to(at::kCPU);
-  unsigned long long* mask_host =
-      (unsigned long long*)mask_cpu.data_ptr<int64_t>();
-  std::vector<unsigned long long> remv(col_blocks);
-  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
-  at::Tensor keep_t = at::zeros(
-      {boxes_num}, boxes.options().dtype(at::kBool).device(at::kCPU));
-  bool* keep = keep_t.data_ptr<bool>();
-  for (int i = 0; i < boxes_num; i++) {
-    int nblock = i / threadsPerBlock;
-    int inblock = i % threadsPerBlock;
-    if (!(remv[nblock] & (1ULL << inblock))) {
-      keep[i] = true;
-      // set every overlap box with bit 1 in remv
-      unsigned long long* p = mask_host + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv[j] |= p[j];
-      }
-    }
-  }
+  // Filter the boxes which should be kept.
+  at::Tensor keep_t = at::zeros(
+      {boxes_num}, boxes.options().dtype(at::kBool).device(at::kCUDA));
+  gather_keep_from_mask<<<1, min(col_blocks, THREADS_PER_BLOCK),
+                          col_blocks * sizeof(unsigned long long), stream>>>(
+      keep_t.data_ptr<bool>(), (unsigned long long*)mask.data_ptr<int64_t>(),
+      boxes_num);
   AT_CUDA_CHECK(cudaGetLastError());
-  return order_t.masked_select(keep_t.to(at::kCUDA));
+  return order_t.masked_select(keep_t);
 }
```
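For context, the host-side sweep removed above decodes the pairwise-overlap bitmask into a keep list; the new `gather_keep_from_mask` kernel performs the equivalent scan on the GPU, so the mask no longer has to be copied to the CPU first. A standalone sketch of that decoding logic, mirroring the removed lines but with plain containers instead of ATen tensors (purely illustrative), looks like this:

```cpp
#include <cstddef>
#include <vector>

// Sketch of the bitmask decoding the old host loop performed (and which
// gather_keep_from_mask now does on the device). The flat mask layout and
// threads_per_block follow the removed code; everything else is illustrative.
std::vector<bool> decode_nms_mask(const unsigned long long *mask,
                                  int boxes_num, int threads_per_block) {
  const int col_blocks =
      (boxes_num + threads_per_block - 1) / threads_per_block;
  std::vector<unsigned long long> remv(col_blocks, 0);  // running removal bits
  std::vector<bool> keep(boxes_num, false);
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threads_per_block;
    int inblock = i % threads_per_block;
    if (!(remv[nblock] & (1ULL << inblock))) {
      keep[i] = true;
      // box i survives: mark every box it overlaps as removed
      const unsigned long long *p = mask + (size_t)i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) remv[j] |= p[j];
    }
  }
  return keep;
}
```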
mmcv/ops/csrc/pytorch/cuda/points_in_boxes_cuda.cu

```diff
@@ -21,7 +21,7 @@ void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
   at::cuda::CUDAGuard device_guard(boxes.device());
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
+  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
   dim3 threads(THREADS_PER_BLOCK);
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -47,7 +47,7 @@ void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
   at::cuda::CUDAGuard device_guard(boxes.device());
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size);
+  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
   dim3 threads(THREADS_PER_BLOCK);
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
```
mmcv/ops/csrc/pytorch/cuda/points_in_polygons_cuda.cu (new file, mode 100644)

```cpp
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/ming71/CUDA/blob/master/point_justify/points_justify_kernel.cu

#include <stdio.h>

#include "points_in_polygons_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void PointsInPolygonsForwardCUDAKernelLauncher(const at::Tensor points,
                                               const at::Tensor polygons,
                                               const int rows, const int cols,
                                               at::Tensor output) {
  const int output_size = rows * cols;
  at::cuda::CUDAGuard device_guard(points.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      points.scalar_type(), "points_in_polygons_forward_cuda_kernel", ([&] {
        const scalar_t *vertex1 = points.data_ptr<scalar_t>();
        const scalar_t *vertex2 = polygons.data_ptr<scalar_t>();
        scalar_t *inside_flag = output.data_ptr<scalar_t>();

        points_in_polygons_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, vertex1, vertex2, rows, cols, inside_flag);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}
```
mmcv/ops/csrc/pytorch/cuda/prroi_pool_cuda.cu (new file, mode 100644)

```cpp
// Copyright (c) OpenMMLab. All rights reserved
#include "prroi_pool_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
                                        Tensor output, int pooled_height,
                                        int pooled_width,
                                        float spatial_scale) {
  int output_size = output.numel();
  int channels = input.size(1);
  int height = input.size(2);
  int width = input.size(3);

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  prroi_pool_forward_cuda_kernel<float>
      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
          output_size, input.data_ptr<float>(), rois.data_ptr<float>(),
          output.data_ptr<float>(), pooled_height, pooled_width,
          static_cast<float>(spatial_scale), channels, height, width);

  AT_CUDA_CHECK(cudaGetLastError());
}

void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                         Tensor grad_input, int pooled_height,
                                         int pooled_width,
                                         float spatial_scale) {
  int output_size = grad_output.numel();
  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  prroi_pool_backward_cuda_kernel<float>
      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
          output_size, grad_output.data_ptr<float>(), rois.data_ptr<float>(),
          grad_input.data_ptr<float>(), pooled_height, pooled_width,
          static_cast<float>(spatial_scale), channels, height, width);

  AT_CUDA_CHECK(cudaGetLastError());
}

void PrROIPoolCoorBackwardCUDAKernelLauncher(Tensor output, Tensor grad_output,
                                             Tensor input, Tensor rois,
                                             Tensor grad_rois,
                                             int pooled_height,
                                             int pooled_width,
                                             float spatial_scale) {
  int output_size = grad_output.numel();
  int channels = input.size(1);
  int height = input.size(2);
  int width = input.size(3);

  at::cuda::CUDAGuard device_guard(grad_output.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  prroi_pool_coor_backward_cuda_kernel<float>
      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
          output_size, output.data_ptr<float>(), grad_output.data_ptr<float>(),
          input.data_ptr<float>(), rois.data_ptr<float>(),
          grad_rois.data_ptr<float>(), pooled_height, pooled_width,
          static_cast<float>(spatial_scale), channels, height, width);

  AT_CUDA_CHECK(cudaGetLastError());
}
```
mmcv/ops/csrc/pytorch/cuda/psamask_cuda.cu

```diff
@@ -2,11 +2,8 @@
 // Modified from
 // https://github.com/hszhao/semseg/blob/master/lib/psa/src
-#include <THC/THC.h>
-#include <torch/serialize/tensor.h>
-#include <THC/THCDeviceUtils.cuh>
 #include "psamask_cuda_kernel.cuh"
 #include "pytorch_cuda_helper.hpp"
```
mmcv/ops/csrc/pytorch/cuda/riroi_align_rotated_cuda.cu (new file, mode 100644)

```cpp
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cuda_helper.hpp"
#include "riroi_align_rotated_cuda_kernel.cuh"

void RiROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width,
    const int num_orientations, at::Tensor output) {
  const int output_size =
      num_rois * pooled_height * pooled_width * channels * num_orientations;
  at::cuda::CUDAGuard device_guard(features.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "riroi_align_rotated_forward_cuda_kernel", ([&] {
        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
        scalar_t *top_data = output.data_ptr<scalar_t>();

        riroi_align_rotated_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
                num_samples, clockwise, channels, height, width, pooled_height,
                pooled_width, num_orientations, top_data);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}

void RiROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int num_samples, const bool clockwise, const int channels,
    const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width,
    const int num_orientations, at::Tensor bottom_grad) {
  const int output_size =
      num_rois * pooled_height * pooled_width * channels * num_orientations;
  at::cuda::CUDAGuard device_guard(top_grad.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "riroi_align_rotated_backward_cuda_kernel", ([&] {
        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();
        riroi_align_rotated_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, top_diff, rois_data, spatial_scale, num_samples,
                clockwise, channels, height, width, pooled_height, pooled_width,
                num_orientations, bottom_diff);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}
```
mmcv/ops/csrc/pytorch/cuda/roi_align_rotated_cuda.cu

```diff
@@ -3,21 +3,21 @@
 #include "roi_align_rotated_cuda_kernel.cuh"

 void ROIAlignRotatedForwardCUDAKernelLauncher(
-    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
-    const int sample_num, const bool aligned, const bool clockwise,
+    const at::Tensor input, const at::Tensor rois, const float spatial_scale,
+    const int sampling_ratio, const bool aligned, const bool clockwise,
     const int channels, const int height, const int width, const int num_rois,
     const int pooled_height, const int pooled_width, at::Tensor output) {
   const int output_size = num_rois * pooled_height * pooled_width * channels;
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      features.scalar_type(), "ROIAlignRotatedLaucherForward", ([&] {
-        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
+      input.scalar_type(), "ROIAlignRotatedLaucherForward", ([&] {
+        const scalar_t *bottom_data = input.data_ptr<scalar_t>();
         const scalar_t *rois_data = rois.data_ptr<scalar_t>();
         scalar_t *top_data = output.data_ptr<scalar_t>();
         roi_align_rotated_forward_cuda_kernel<scalar_t>
             <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                 output_size, bottom_data, rois_data, scalar_t(spatial_scale),
-                sample_num, aligned, clockwise, channels, height, width,
+                sampling_ratio, aligned, clockwise, channels, height, width,
                 pooled_height, pooled_width, top_data);
       }));
@@ -26,7 +26,7 @@ void ROIAlignRotatedForwardCUDAKernelLauncher(
 void ROIAlignRotatedBackwardCUDAKernelLauncher(
     const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
-    const int sample_num, const bool aligned, const bool clockwise,
+    const int sampling_ratio, const bool aligned, const bool clockwise,
     const int channels, const int height, const int width, const int num_rois,
     const int pooled_height, const int pooled_width, at::Tensor bottom_grad) {
   const int output_size = num_rois * pooled_height * pooled_width * channels;
@@ -37,7 +37,7 @@ void ROIAlignRotatedBackwardCUDAKernelLauncher(
         scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();
         roi_align_rotated_backward_cuda_kernel<scalar_t>
             <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
-                output_size, top_diff, rois_data, spatial_scale, sample_num,
+                output_size, top_diff, rois_data, spatial_scale, sampling_ratio,
                 aligned, clockwise, channels, height, width, pooled_height,
                 pooled_width, bottom_diff);
       }));
```
mmcv/ops/csrc/pytorch/cuda/roiaware_pool3d_cuda.cu

```diff
@@ -26,7 +26,7 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
   Tensor pts_mask =
       -at::ones({boxes_num, pts_num}, pts_feature.options().dtype(at::kInt));
-  dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num);
+  dim3 blocks_mask(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num);
   dim3 threads(THREADS_PER_BLOCK);
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -42,7 +42,7 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
   // TODO: Merge the collect and pool functions, SS
-  dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK));
+  dim3 blocks_collect(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK));
   AT_DISPATCH_INTEGRAL_TYPES(
       pts_idx_of_voxels.scalar_type(), "collect_inside_pts_for_box3d", [&] {
@@ -55,8 +55,8 @@ void RoiawarePool3dForwardCUDAKernelLauncher(
   AT_CUDA_CHECK(cudaGetLastError());

-  dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
-                   boxes_num);
+  dim3 blocks_pool(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK),
+                   channels, boxes_num);
   if (pool_method == 0) {
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         pts_feature.scalar_type(), "roiaware_maxpool3d", [&] {
@@ -93,7 +93,7 @@ void RoiawarePool3dBackwardCUDAKernelLauncher(
   at::cuda::CUDAGuard device_guard(grad_out.device());
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
+  dim3 blocks(GET_BLOCKS(out_x * out_y * out_z, THREADS_PER_BLOCK), channels,
               boxes_num);
   dim3 threads(THREADS_PER_BLOCK);
```
mmcv/ops/csrc/pytorch/cuda/roipoint_pool3d_cuda.cu

```diff
@@ -24,7 +24,7 @@ void RoIPointPool3dForwardCUDAKernelLauncher(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
+  dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size);
   dim3 threads(THREADS_PER_BLOCK);
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -38,14 +38,14 @@ void RoIPointPool3dForwardCUDAKernelLauncher(
       boxes3d.options().dtype(at::kInt));
   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size);
+  dim3 blocks2(GET_BLOCKS(boxes_num, THREADS_PER_BLOCK), batch_size);

   get_pooled_idx<<<blocks2, threads, 0, stream>>>(
       batch_size, pts_num, boxes_num, sampled_pts_num,
       pts_assign.data_ptr<int>(), pts_idx.data_ptr<int>(),
       pooled_empty_flag.data_ptr<int>());

-  dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
+  dim3 blocks_pool(GET_BLOCKS(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
                    batch_size);
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
```
mmcv/ops/csrc/pytorch/cuda/rotated_feature_align_cuda.cu (new file, mode 100644)

```cpp
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "pytorch_cuda_helper.hpp"
#include "rotated_feature_align_cuda_kernel.cuh"

void RotatedFeatureAlignForwardCUDAKernelLauncher(const Tensor features,
                                                  const Tensor best_bboxes,
                                                  const float spatial_scale,
                                                  const int points,
                                                  Tensor output) {
  at::cuda::CUDAGuard device_guard(features.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int output_size = features.numel();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "rotated_feature_align_forward_cuda_kernel",
      ([&] {
        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
        const scalar_t *bboxes_data = best_bboxes.data_ptr<scalar_t>();
        scalar_t *top_data = output.data_ptr<scalar_t>();

        rotated_feature_align_forward_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, points, bottom_data, bboxes_data,
                scalar_t(spatial_scale), features.size(1), features.size(2),
                features.size(3), top_data);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}

void RotatedFeatureAlignBackwardCUDAKernelLauncher(const Tensor top_grad,
                                                   const Tensor best_bboxes,
                                                   const float spatial_scale,
                                                   const int points,
                                                   Tensor bottom_grad) {
  at::cuda::CUDAGuard device_guard(top_grad.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int output_size = top_grad.numel();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "rotated_feature_align_backward_cuda_kernel",
      ([&] {
        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
        const scalar_t *bboxes_data = best_bboxes.data_ptr<scalar_t>();
        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();

        rotated_feature_align_backward_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, points, top_diff, bboxes_data,
                scalar_t(spatial_scale), top_grad.size(1), top_grad.size(2),
                top_grad.size(3), bottom_diff);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}
```
mmcv/ops/csrc/pytorch/cuda/scatter_points_cuda.cu

```diff
@@ -26,10 +26,15 @@ std::vector<at::Tensor> DynamicPointToVoxelForwardCUDAKernelLauncher(
   std::tie(out_coors, coors_map, reduce_count) =
       at::unique_dim(coors_clean, 0, true, true, true);

-  // the first element of out_coors is always (-1,-1,-1) and should be removed
-  out_coors = out_coors.slice(0, 1);
-  reduce_count = reduce_count.slice(0, 1).to(torch::kInt32);
-  coors_map = coors_map.to(torch::kInt32) - 1;
+  if (out_coors[0][0].lt(0).item<bool>()) {
+    // the first element of out_coors (-1,-1,-1) and should be removed
+    out_coors = out_coors.slice(0, 1);
+    reduce_count = reduce_count.slice(0, 1);
+    coors_map = coors_map - 1;
+  }
+
+  coors_map = coors_map.to(torch::kInt32);
+  reduce_count = reduce_count.to(torch::kInt32);

   auto reduced_feats =
       at::empty({out_coors.size(0), num_feats}, feats.options());
```
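The change above stops assuming that `at::unique_dim` always yields a leading (-1, -1, -1) row: that row exists only when at least one point had out-of-range coordinates and was clamped to -1 before the unique call. A minimal self-contained sketch of the guard, using the same `unique_dim` call pattern (the toy coordinates below are made up for illustration):

```cpp
#include <ATen/ATen.h>
#include <torch/torch.h>

// Sketch: unique_dim sorts rows, so an all-(-1) "invalid" row, if present,
// ends up first. The guard below mirrors the new conditional in the diff.
void demo_conditional_strip() {
  // Toy voxel coordinates; the (-1,-1,-1) row stands for clipped points.
  at::Tensor coors = torch::stack({torch::tensor({-1, -1, -1}, torch::kInt),
                                   torch::tensor({0, 1, 2}, torch::kInt),
                                   torch::tensor({0, 1, 2}, torch::kInt),
                                   torch::tensor({3, 0, 1}, torch::kInt)});
  at::Tensor out_coors, coors_map, reduce_count;
  std::tie(out_coors, coors_map, reduce_count) =
      at::unique_dim(coors, 0, true, true, true);
  if (out_coors[0][0].lt(0).item<bool>()) {
    // Only drop the first row when it really is the invalid marker.
    out_coors = out_coors.slice(0, 1);
    reduce_count = reduce_count.slice(0, 1);
    coors_map = coors_map - 1;
  }
}
```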
mmcv/ops/csrc/pytorch/cuda/sparse_indice.cu (new file, mode 100644)

```cpp
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <ATen/ATen.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <chrono>
#include <limits>
#include <spconv/indice.cuh>
#include <type_traits>

#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"

namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::TorchGPU &d,
                   tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose) {
    Index batchSize = gridsOut.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    if (transpose)
      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    else
      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    return 1;
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::TorchGPU &d,
                   tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    Index batchSize = gridsOut.dim(0);
    auto kernelVolume = indicePairs.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    Index numAct = indicePairUnique.dim(0) - 1;
    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
                            indicePairUnique, outSpatialShape, batchSize);
    TV_CHECK_CUDA_ERR();
    assignIndicePairsKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
                            indicePairUnique, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    if (resetGrid) {
      resetGridKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
      TV_CHECK_CUDA_ERR();
    }
    return numAct;
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::TorchGPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::TorchGPU &d,
                   tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    prepareSubMGridKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
                            kernelSize, stride, padding, dilation,
                            outSpatialShape);
    TV_CHECK_CUDA_ERR();
    if (resetGrid) {
      resetGridSubMKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
                              numActIn);
      TV_CHECK_CUDA_ERR();
    }
    return numActIn;
  }
};
}  // namespace functor

#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM)                             \
  template struct functor::CreateConvIndicePairFunctor<tv::TorchGPU, Index,   \
                                                       int, NDIM>;            \
  template struct functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, Index, \
                                                         int, NDIM>;          \
  template struct functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, Index, \
                                                         int, NDIM>;          \
  template struct functor::CreateSubMIndicePairFunctor<tv::TorchGPU, Index,   \
                                                       int, NDIM>;

#define DECLARE_GPU_INDEX(Index)          \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);

DECLARE_GPU_INDEX(int);

#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
```
mmcv/ops/csrc/pytorch/cuda/sparse_maxpool.cu (new file, mode 100644)

```cpp
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <ATen/ATen.h>
#include <utils/spconv/spconv/maxpool.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <chrono>
#include <limits>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.cuh>

#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"

template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(scalar_t *outFeatures,
                                      const scalar_t *inFeatures,
                                      const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  scalar_t in, out;
  int ILPStrideY[NumILP];
  Index idxo, idxi;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
       ix += blockDim.x * gridDim.x) {
    {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
        in = inFeatures[idxi];
        out = outFeatures[idxo];
        if (in > out) {
          outFeatures[idxo] = in;
        }
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(scalar_t *outFeatures,
                                             const scalar_t *inFeatures,
                                             const Index *indicesIn,
                                             const Index *indicesOut,
                                             int numHot, int numPlanes) {
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  scalar_t in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        in = inFeatures[RI[ilp] + iy];
        out = outFeatures[RO[ilp] + iy];
        if (in > out) {
          outFeatures[RO[ilp] + iy] = in;
        }
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void maxPoolFwdVecBlockKernel(scalar_t *outFeatures,
                                         const scalar_t *inFeatures,
                                         const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
  scalar_t bufi[vecloadFactor];
  scalar_t bufo[vecloadFactor];
  Index idxi, idxo;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
       ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(bufo)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idxo];
      reinterpret_cast<VecType *>(bufi)[0] =
          reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        if (bufi[i] > bufo[i]) {
          bufo[i] = bufi[i];
        }
      }
      reinterpret_cast<VecType *>(outFeatures)[idxo] =
          reinterpret_cast<VecType *>(bufo)[0];
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(scalar_t *outFeatures,
                                        const scalar_t *inFeatures,
                                        const Index *indicesIn,
                                        const Index *indicesOut, int numHot,
                                        int numPlanes) {
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  scalar_t in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < numHot) {
        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < numHot) {
          in = inFeatures[RI[ilp] + iy];
          out = outFeatures[RO[ilp] + iy];
          if (in > out) {
            outFeatures[RO[ilp] + iy] = in;
          }
        }
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const scalar_t *outFeatures,
                                      const scalar_t *inFeatures,
                                      const scalar_t *fout, scalar_t *fin,
                                      const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  scalar_t in, out;
  Index idxo, idxi;
  int ILPStrideY[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  fout += blockIdx.y * NumTLP;
  fin += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
       ix += blockDim.x * gridDim.x) {
    {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
        in = inFeatures[idxi];
        out = outFeatures[idxo];
        if (in == out) {
          fin[idxi] += fout[idxo];
        }
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(const scalar_t *outFeatures,
                                             const scalar_t *inFeatures,
                                             const scalar_t *fout,
                                             scalar_t *fin,
                                             const Index *indicesIn,
                                             const Index *indicesOut,
                                             int numHot, int numPlanes) {
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  scalar_t in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        in = inFeatures[RI[ilp] + iy];
        out = outFeatures[RO[ilp] + iy];
        if (in == out) {
          fin[RI[ilp] + iy] += fout[RO[ilp] + iy];
        }
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const scalar_t *outFeatures,
                                         const scalar_t *inFeatures,
                                         const scalar_t *fout, scalar_t *fin,
                                         const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
  scalar_t bufi[vecloadFactor];
  scalar_t bufo[vecloadFactor];
  scalar_t bufdi[vecloadFactor];
  scalar_t bufdo[vecloadFactor];
  Index idxi, idxo;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
       ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(bufo)[0] =
          reinterpret_cast<const VecType *>(outFeatures)[idxo];
      reinterpret_cast<VecType *>(bufi)[0] =
          reinterpret_cast<const VecType *>(inFeatures)[idxi];
      reinterpret_cast<VecType *>(bufdo)[0] =
          reinterpret_cast<const VecType *>(fout)[idxo];
      reinterpret_cast<VecType *>(bufdi)[0] =
          reinterpret_cast<VecType *>(fin)[idxi];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        if (bufi[i] == bufo[i]) {
          bufdi[i] += bufdo[i];
        }
      }
      reinterpret_cast<VecType *>(fin)[idxi] =
          reinterpret_cast<VecType *>(bufdi)[0];
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericKernel(const scalar_t *outFeatures,
                                        const scalar_t *inFeatures,
                                        const scalar_t *fout, scalar_t *fin,
                                        const Index *indicesIn,
                                        const Index *indicesOut, int numHot,
                                        int numPlanes) {
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  scalar_t in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < numHot) {
        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < numHot) {
          in = inFeatures[RI[ilp] + iy];
          out = outFeatures[RO[ilp] + iy];
          if (in == out) {
            fin[RI[ilp] + iy] += fout[RO[ilp] + iy];
          }
        }
      }
    }
  }
}

namespace functor {
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = inFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
            maxPoolFwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    indices.subview(0).data(),
                                    indices.subview(1).data(), numHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size > numHotBlock) {
            maxPoolFwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                       indices.subview(0).data() + numHotBlock,
                                       indices.subview(1).data() + numHotBlock,
                                       size - numHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });

    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        maxPoolFwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data(), indices.subview(1).data(),
                numHotBlock, numPlanes);
        TV_CHECK_CUDA_ERR();
      }
      if (size > numHotBlock) {
        maxPoolFwdGenericKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
                numPlanes);
        TV_CHECK_CUDA_ERR();
      }
    }
  }
};

template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::TorchGPU &d,
                  tv::TensorView<const scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const scalar_t> fout,
                  tv::TensorView<scalar_t> fin,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = inFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &fout, &fin,
                                 &indices, &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
            maxPoolBwdVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    fout.data(), fin.data(),
                                    indices.subview(0).data(),
                                    indices.subview(1).data(), numHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size > numHotBlock) {
            maxPoolBwdGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                       fout.data(), fin.data(),
                                       indices.subview(0).data() + numHotBlock,
                                       indices.subview(1).data() + numHotBlock,
                                       size - numHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });

    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        maxPoolBwdGenericBlockKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),
                indices.subview(0).data(), indices.subview(1).data(),
                numHotBlock, numPlanes);
        TV_CHECK_CUDA_ERR();
      }
      if (size > numHotBlock) {
        maxPoolBwdGenericKernel<scalar_t, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), fout.data(), fin.data(),
                indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
                numPlanes);
        TV_CHECK_CUDA_ERR();
      }
    }
  }
};
}  // namespace functor

#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index)                             \
  template struct functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, \
                                                       Index>;                 \
  template struct functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU,          \
                                                        scalar_t, Index>;

#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);

DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
```
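The block/ILP template parameters above control how each thread block tiles the (active index pairs × channels) matrix: a block handles a NumTLP-wide strip of channels, each thread processes NumILP rows, and the vectorized variant additionally loads vecloadFactor channels per memory instruction. The snippet below is a small self-contained sanity check of the launch shapes used in `SparseMaxPoolForwardFunctor` for one candidate NumTLP; the concrete `size` and `numPlanes` values are made-up examples, not taken from the commit.

```cpp
#include <algorithm>
#include <cstdio>

// Reproduce the launch-geometry arithmetic from SparseMaxPoolForwardFunctor
// for one tile size. size/numPlanes below are illustrative example values.
int main() {
  const int size = 10000;    // number of active (input, output) index pairs
  const int numPlanes = 64;  // feature channels
  const int NumTLP = 64;     // tile width in channels
  const int NumILP = NumTLP / 4;  // rows handled per thread
  const int vecloadFactor = 4;    // e.g. four floats loaded as one int4
  const int numHotBlock = (size / NumTLP) * NumTLP;  // rows covered by full tiles

  // Vectorized path: grid = (min(size/NumTLP, 512), numPlanes/NumTLP),
  // block = (NumTLP/vecloadFactor, NumTLP/NumILP).
  printf("grid  = (%d, %d)\n", std::min(size / NumTLP, 512),
         numPlanes / NumTLP);
  printf("block = (%d, %d)\n", NumTLP / vecloadFactor, NumTLP / NumILP);
  printf("tail rows handled by the generic kernel: %d\n", size - numHotBlock);
  return 0;
}
```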
mmcv/ops/csrc/pytorch/cuda/sparse_pool_ops_cuda.cu (new file, mode 100644)

```cpp
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/maxpool.h>

#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"

torch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,
                                                     torch::Tensor indicePairs,
                                                     torch::Tensor indiceNum,
                                                     int64_t numAct) {
  at::cuda::CUDAGuard device_guard(features.device());
  auto device = features.device().type();
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
  double totalTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        features.scalar_type(), "IndiceMaxpoolForwardKernel", [&] {
          if (device == torch::kCPU) {
            functor::SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, int>
                forwardFtor;
            forwardFtor(tv::CPU(), tv::torch2tv<scalar_t>(output),
                        tv::torch2tv<const scalar_t>(features),
                        tv::torch2tv<const int>(indicePairs).subview(i), nHot);
          } else {
            functor::SparseMaxPoolForwardFunctor<tv::TorchGPU, scalar_t, int>
                forwardFtor;
            forwardFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
                        tv::torch2tv<const scalar_t>(features),
                        tv::torch2tv<const int>(indicePairs).subview(i), nHot);
            TV_CHECK_CUDA_ERR();
          }
        });
  }
  return output;
}

torch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,
                                                      torch::Tensor outFeatures,
                                                      torch::Tensor outGrad,
                                                      torch::Tensor indicePairs,
                                                      torch::Tensor indiceNum) {
  at::cuda::CUDAGuard device_guard(features.device());
  auto device = features.device().type();
  auto numInPlanes = features.size(1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  auto kernelVolume = indicePairs.size(0);
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0) {
      continue;
    }
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        features.scalar_type(), "IndiceMaxpoolBackwardKernel", [&] {
          if (device == torch::kCPU) {
            functor::SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, int>
                backwardFtor;
            backwardFtor(tv::CPU(), tv::torch2tv<const scalar_t>(outFeatures),
                         tv::torch2tv<const scalar_t>(features),
                         tv::torch2tv<const scalar_t>(outGrad),
                         tv::torch2tv<scalar_t>(inputGrad),
                         tv::torch2tv<const int>(indicePairs).subview(i), nHot);
          } else {
            functor::SparseMaxPoolBackwardFunctor<tv::TorchGPU, scalar_t, int>
                backwardFtor;
            backwardFtor(tv::TorchGPU(),
                         tv::torch2tv<const scalar_t>(outFeatures),
                         tv::torch2tv<const scalar_t>(features),
                         tv::torch2tv<const scalar_t>(outGrad),
                         tv::torch2tv<scalar_t>(inputGrad),
                         tv::torch2tv<const int>(indicePairs).subview(i), nHot);
            TV_CHECK_CUDA_ERR();
          }
        });
  }
  return inputGrad;
}
```