OpenDAS / MMCV · Commits

Commit fdeee889, authored May 25, 2025 by limm

    release v1.6.1 of mmcv

parent df465820

Changes: 457. Showing 20 changed files with 1469 additions and 334 deletions (+1469 −334):
mmcv/ops/csrc/pytorch/corner_pool.cpp                      +0   −240
mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp        +120 −0
mmcv/ops/csrc/pytorch/cpu/nms_rotated.cpp                  +1   −1
mmcv/ops/csrc/pytorch/cpu/pixel_group.cpp                  +4   −2
mmcv/ops/csrc/pytorch/cpu/roi_align_rotated.cpp            +3   −6
mmcv/ops/csrc/pytorch/cpu/rotated_feature_align.cpp        +262 −0
mmcv/ops/csrc/pytorch/cpu/sparse_indice.cpp                +84  −0
mmcv/ops/csrc/pytorch/cpu/sparse_maxpool.cpp               +82  −0
mmcv/ops/csrc/pytorch/cpu/sparse_reordering.cpp            +68  −0
mmcv/ops/csrc/pytorch/cpu/voxelization.cpp                 +23  −7
mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu   +58  −0
mmcv/ops/csrc/pytorch/cuda/assign_score_withk_cuda.cu      +3   −3
mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.cu              +1   −1
mmcv/ops/csrc/pytorch/cuda/bbox_overlaps_cuda.cu           +16  −0
mmcv/ops/csrc/pytorch/cuda/chamfer_distance_cuda.cu        +63  −0
mmcv/ops/csrc/pytorch/cuda/convex_iou.cu                   +41  −0
mmcv/ops/csrc/pytorch/cuda/correlation_cuda.cu             +21  −20
mmcv/ops/csrc/pytorch/cuda/cudabind.cpp                    +480 −54
mmcv/ops/csrc/pytorch/cuda/diff_iou_rotated_cuda.cu        +35  −0
mmcv/ops/csrc/pytorch/cuda/fused_spconv_ops_cuda.cu        +104 −0
(Too many changes to show. To preserve performance only 457 of 457+ files are displayed.)
mmcv/ops/csrc/pytorch/corner_pool.cpp (deleted, 100644 → 0)

// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "pytorch_cpp_helper.hpp"

Tensor bottom_pool_forward(Tensor input) {
  // Initialize output
  Tensor output = at::zeros_like(input);
  // Get height
  int64_t height = input.size(2);
  output.copy_(input);

  for (int64_t ind = 1; ind < height; ind <<= 1) {
    Tensor max_temp = at::slice(output, 2, ind, height);
    Tensor cur_temp = at::slice(output, 2, ind, height).clone();
    Tensor next_temp = at::slice(output, 2, 0, height - ind).clone();
    at::max_out(max_temp, cur_temp, next_temp);
  }

  return output;
}

Tensor bottom_pool_backward(Tensor input, Tensor grad_output) {
  auto output = at::zeros_like(input);
  int32_t batch = input.size(0);
  int32_t channel = input.size(1);
  int32_t height = input.size(2);
  int32_t width = input.size(3);

  auto max_val = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kFloat));
  auto max_ind = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kLong));

  auto input_temp = input.select(2, 0);
  max_val.copy_(input_temp);

  max_ind.fill_(0);

  auto output_temp = output.select(2, 0);
  auto grad_output_temp = grad_output.select(2, 0);
  output_temp.copy_(grad_output_temp);

  auto un_max_ind = max_ind.unsqueeze(2);
  auto gt_mask = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kBool));
  auto max_temp = torch::zeros({batch, channel, width},
                               at::device(at::kCUDA).dtype(at::kFloat));

  for (int32_t ind = 0; ind < height - 1; ++ind) {
    input_temp = input.select(2, ind + 1);
    at::gt_out(gt_mask, input_temp, max_val);

    at::masked_select_out(max_temp, input_temp, gt_mask);
    max_val.masked_scatter_(gt_mask, max_temp);
    max_ind.masked_fill_(gt_mask, ind + 1);

    grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2);
    output.scatter_add_(2, un_max_ind, grad_output_temp);
  }

  return output;
}

Tensor left_pool_forward(Tensor input) {
  // Initialize output
  Tensor output = at::zeros_like(input);
  // Get width
  int64_t width = input.size(3);
  output.copy_(input);

  for (int64_t ind = 1; ind < width; ind <<= 1) {
    Tensor max_temp = at::slice(output, 3, 0, width - ind);
    Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone();
    Tensor next_temp = at::slice(output, 3, ind, width).clone();
    at::max_out(max_temp, cur_temp, next_temp);
  }

  return output;
}

Tensor left_pool_backward(Tensor input, Tensor grad_output) {
  auto output = at::zeros_like(input);
  int32_t batch = input.size(0);
  int32_t channel = input.size(1);
  int32_t height = input.size(2);
  int32_t width = input.size(3);

  auto max_val = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kFloat));
  auto max_ind = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kLong));

  auto input_temp = input.select(3, width - 1);
  max_val.copy_(input_temp);

  max_ind.fill_(width - 1);

  auto output_temp = output.select(3, width - 1);
  auto grad_output_temp = grad_output.select(3, width - 1);
  output_temp.copy_(grad_output_temp);

  auto un_max_ind = max_ind.unsqueeze(3);
  auto gt_mask = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kBool));
  auto max_temp = torch::zeros({batch, channel, height},
                               at::device(at::kCUDA).dtype(at::kFloat));

  for (int32_t ind = 1; ind < width; ++ind) {
    input_temp = input.select(3, width - ind - 1);
    at::gt_out(gt_mask, input_temp, max_val);

    at::masked_select_out(max_temp, input_temp, gt_mask);
    max_val.masked_scatter_(gt_mask, max_temp);
    max_ind.masked_fill_(gt_mask, width - ind - 1);

    grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3);
    output.scatter_add_(3, un_max_ind, grad_output_temp);
  }

  return output;
}

Tensor right_pool_forward(Tensor input) {
  // Initialize output
  Tensor output = at::zeros_like(input);
  // Get width
  int64_t width = input.size(3);
  output.copy_(input);

  for (int64_t ind = 1; ind < width; ind <<= 1) {
    Tensor max_temp = at::slice(output, 3, ind, width);
    Tensor cur_temp = at::slice(output, 3, ind, width).clone();
    Tensor next_temp = at::slice(output, 3, 0, width - ind).clone();
    at::max_out(max_temp, cur_temp, next_temp);
  }

  return output;
}

Tensor right_pool_backward(Tensor input, Tensor grad_output) {
  Tensor output = at::zeros_like(input);
  int32_t batch = input.size(0);
  int32_t channel = input.size(1);
  int32_t height = input.size(2);
  int32_t width = input.size(3);

  auto max_val = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kFloat));
  auto max_ind = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kLong));

  auto input_temp = input.select(3, 0);
  max_val.copy_(input_temp);

  max_ind.fill_(0);

  auto output_temp = output.select(3, 0);
  auto grad_output_temp = grad_output.select(3, 0);
  output_temp.copy_(grad_output_temp);

  auto un_max_ind = max_ind.unsqueeze(3);
  auto gt_mask = torch::zeros({batch, channel, height},
                              at::device(at::kCUDA).dtype(at::kBool));
  auto max_temp = torch::zeros({batch, channel, height},
                               at::device(at::kCUDA).dtype(at::kFloat));

  for (int32_t ind = 0; ind < width - 1; ++ind) {
    input_temp = input.select(3, ind + 1);
    at::gt_out(gt_mask, input_temp, max_val);

    at::masked_select_out(max_temp, input_temp, gt_mask);
    max_val.masked_scatter_(gt_mask, max_temp);
    max_ind.masked_fill_(gt_mask, ind + 1);

    grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3);
    output.scatter_add_(3, un_max_ind, grad_output_temp);
  }

  return output;
}

Tensor top_pool_forward(Tensor input) {
  // Initialize output
  Tensor output = at::zeros_like(input);
  // Get height
  int64_t height = input.size(2);
  output.copy_(input);

  for (int64_t ind = 1; ind < height; ind <<= 1) {
    Tensor max_temp = at::slice(output, 2, 0, height - ind);
    Tensor cur_temp = at::slice(output, 2, 0, height - ind).clone();
    Tensor next_temp = at::slice(output, 2, ind, height).clone();
    at::max_out(max_temp, cur_temp, next_temp);
  }

  return output;
}

Tensor top_pool_backward(Tensor input, Tensor grad_output) {
  auto output = at::zeros_like(input);
  int32_t batch = input.size(0);
  int32_t channel = input.size(1);
  int32_t height = input.size(2);
  int32_t width = input.size(3);

  auto max_val = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kFloat));
  auto max_ind = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kLong));

  auto input_temp = input.select(2, height - 1);
  max_val.copy_(input_temp);

  max_ind.fill_(height - 1);

  auto output_temp = output.select(2, height - 1);
  auto grad_output_temp = grad_output.select(2, height - 1);
  output_temp.copy_(grad_output_temp);

  auto un_max_ind = max_ind.unsqueeze(2);
  auto gt_mask = torch::zeros({batch, channel, width},
                              at::device(at::kCUDA).dtype(at::kBool));
  auto max_temp = torch::zeros({batch, channel, width},
                               at::device(at::kCUDA).dtype(at::kFloat));

  for (int32_t ind = 1; ind < height; ++ind) {
    input_temp = input.select(2, height - ind - 1);
    at::gt_out(gt_mask, input_temp, max_val);

    at::masked_select_out(max_temp, input_temp, gt_mask);
    max_val.masked_scatter_(gt_mask, max_temp);
    max_ind.masked_fill_(gt_mask, height - ind - 1);

    grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2);
    output.scatter_add_(2, un_max_ind, grad_output_temp);
  }

  return output;
}
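For intuition: each forward function above is a directional cumulative max over the feature map, computed in doubling passes of stride 1, 2, 4, ... instead of one row or column at a time. A minimal libtorch sketch of the equivalent naive recurrence for bottom pooling (the naive_bottom_pool helper is illustrative only, not part of mmcv or this commit):

#include <torch/torch.h>

// Reference recurrence for bottom pooling: out[n][c][i][w] is the max of
// x[n][c][0..i][w], i.e. a cumulative max down dim 2. The deleted file
// reaches the same fixed point in ceil(log2(H)) doubling steps.
torch::Tensor naive_bottom_pool(const torch::Tensor& x) {
  auto out = x.clone();
  const int64_t height = x.size(2);
  for (int64_t i = 1; i < height; ++i) {
    auto row = out.select(2, i);  // view of row i; writes go through copy_
    row.copy_(torch::max(row, out.select(2, i - 1)));
  }
  return out;
}

The doubling loop in the deleted code is valid because a running max is idempotent and associative: after the pass with step ind, every row already holds the max of a window of size 2 * ind, so ceil(log2 H) passes cover the whole prefix.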
mmcv/ops/csrc/pytorch/cpu/active_rotated_filter.cpp (new file, 0 → 100644)

// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cpu/ActiveRotatingFilter_cpu.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

template <typename T>
void active_rotated_filter_forward_cpu_kernel(
    const T* weightData, const int* indicesData, const int num_output_planes,
    const int num_input_planes, const int num_orientations, const int kH,
    const int kW, const int num_rotations, T* outputData) {
  const int nEntry = num_orientations * kH * kW;
  int i, j, l;
  int k;

#pragma omp parallel for private(i, j, l, k)
  for (i = 0; i < num_output_planes; i++) {
    for (j = 0; j < num_input_planes; j++) {
      for (l = 0; l < nEntry; l++) {
        int weightIndex = i * num_input_planes * nEntry + j * nEntry + l;
        T val = *(weightData + weightIndex);
        for (k = 0; k < num_rotations; k++) {
          int index = (int)(*(indicesData + l * num_rotations + k)) - 1;
          T* target = outputData +
                      i * (num_rotations * num_input_planes * nEntry) +
                      k * (num_input_planes * nEntry) + j * (nEntry) + index;
          *target = val;
        }
      }
    }
  }
}

template <typename T>
void active_rotated_filter_backward_cpu_kernel(
    const T* gradOutputData, const int* indicesData,
    const int num_output_planes, const int num_input_planes,
    const int num_orientations, const int kH, const int kW,
    const int num_rotations, T* gradInputData) {
  const int nEntry = num_orientations * kH * kW;
  int i, j, l;
  int k;

#pragma omp parallel for private(i, j, l, k)
  for (i = 0; i < num_output_planes; i++) {
    for (j = 0; j < num_input_planes; j++) {
      for (l = 0; l < nEntry; l++) {
        int gradInputIndex = i * num_input_planes * nEntry + j * nEntry + l;
        T* val = gradInputData + gradInputIndex;
        *val = 0;
        for (k = 0; k < num_rotations; k++) {
          int index = (int)(*(indicesData + l * num_rotations + k)) - 1;
          const T* target =
              gradOutputData +
              i * (num_rotations * num_input_planes * nEntry) +
              k * (num_input_planes * nEntry) + j * (nEntry) + index;
          *val = *val + *target;
        }
      }
    }
  }
}

void ActiveRotatedFilterForwardCPULauncher(const Tensor input,
                                           const Tensor indices,
                                           Tensor output) {
  const int num_output_planes = input.size(0);
  const int num_input_planes = input.size(1);
  const int num_orientations = input.size(2);
  const int kH = input.size(3);
  const int kW = input.size(4);
  const int num_rotations = indices.size(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "active_rotated_filter_forward_cpu_kernel", [&] {
        active_rotated_filter_forward_cpu_kernel<scalar_t>(
            input.data_ptr<scalar_t>(), indices.data_ptr<int>(),
            num_output_planes, num_input_planes, num_orientations, kH, kW,
            num_rotations, output.data_ptr<scalar_t>());
      });
}

void ActiveRotatedFilterBackwardCPULauncher(const Tensor grad_out,
                                            const Tensor indices,
                                            Tensor grad_in) {
  const int num_orientations = indices.size(0);
  const int kH = indices.size(1);
  const int kW = indices.size(2);
  const int num_rotations = indices.size(3);
  const int num_output_planes = grad_out.size(0) / num_rotations;
  const int num_input_planes = grad_out.size(1) / num_orientations;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "active_rotated_filter_backward_cpu_kernel",
      [&] {
        active_rotated_filter_backward_cpu_kernel<scalar_t>(
            grad_out.data_ptr<scalar_t>(), indices.data_ptr<int>(),
            num_output_planes, num_input_planes, num_orientations, kH, kW,
            num_rotations, grad_in.data_ptr<scalar_t>());
      });
}

void active_rotated_filter_forward_cpu(const Tensor input,
                                       const Tensor indices, Tensor output) {
  ActiveRotatedFilterForwardCPULauncher(input, indices, output);
}

void active_rotated_filter_backward_cpu(const Tensor grad_out,
                                        const Tensor indices, Tensor grad_in) {
  ActiveRotatedFilterBackwardCPULauncher(grad_out, indices, grad_in);
}

void active_rotated_filter_forward_impl(const Tensor input,
                                        const Tensor indices, Tensor output);

void active_rotated_filter_backward_impl(const Tensor grad_out,
                                         const Tensor indices, Tensor grad_in);

REGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CPU,
                     active_rotated_filter_forward_cpu);
REGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CPU,
                     active_rotated_filter_backward_cpu);
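To make the scatter pattern above concrete, here is the same flat-offset arithmetic in isolation (plain C++, illustrative only; rotated_target_offset is not an mmcv symbol): weight entry l of plane pair (i, j) is written into the k-th rotated copy of the filter at the position given by the lookup table, which stores 1-based positions.

#include <cstdio>

// Flat offset into outputData used by the forward kernel above; the LUT
// entries are 1-based, hence the -1.
int rotated_target_offset(int i, int j, int k, int lut_entry,
                          int num_input_planes, int num_rotations,
                          int nEntry) {
  int index = lut_entry - 1;
  return i * (num_rotations * num_input_planes * nEntry) +
         k * (num_input_planes * nEntry) + j * nEntry + index;
}

int main() {
  // e.g. 8 orientations on a 3x3 kernel -> nEntry = 8 * 3 * 3 = 72
  std::printf("%d\n", rotated_target_offset(1, 0, 2, 5, 2, 4, 72));
  return 0;
}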
mmcv/ops/csrc/pytorch/cpu/nms_rotated.cpp

@@ -59,7 +59,7 @@ Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
 Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
                        const float iou_threshold) {
   auto result = at::empty({0}, dets.options());
-  AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms_rotated", [&] {
+  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
     result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
   });
   return result;
mmcv/ops/csrc/pytorch/cpu/pixel_group.cpp

 // Copyright (c) OpenMMLab. All rights reserved
 // It is modified from https://github.com/WenmuZhou/PAN.pytorch
+#include <queue>
+
 #include "pytorch_cpp_helper.hpp"
 #include "pytorch_device_registry.hpp"

@@ -39,7 +41,7 @@ std::vector<std::vector<float>> pixel_group_cpu(
     Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
   assert(score.dim() == 2);
   assert(mask.dim() == 2);
-  assert(embedding_dim.dim() == 3);
+  assert(embedding.dim() == 3);
   int height = score.size(0);
   int width = score.size(1);
   assert(height == mask.size(0) == embedding.size(1) == kernel_label.size(1));

@@ -103,7 +105,7 @@ std::vector<std::vector<float>> pixel_group_cpu(
         float dis = 0;
         auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;
-        for (size_t i = 0; i < embedding_dim; i++) {
+        for (size_t i = 0; i < size_t(embedding_dim); i++) {
           dis +=
               pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);
           // ignore further computing if dis is big enough
mmcv/ops/csrc/pytorch/cpu/roi_align_rotated.cpp

@@ -395,7 +395,6 @@ void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
                                         int aligned_width, float spatial_scale,
                                         int sampling_ratio, bool aligned,
                                         bool clockwise) {
-  int output_size = grad_output.numel();
   int channels = grad_input.size(1);
   int height = grad_input.size(2);
   int width = grad_input.size(3);

@@ -431,8 +430,6 @@ void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise) {
-  // Number of ROIs
-  int num_rois = rois.size(0);
   int size_rois = rois.size(1);
   if (size_rois != 6) {
     AT_ERROR("wrong roi size");

@@ -442,15 +439,15 @@ void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
                       sampling_ratio, aligned, clockwise);
 }
 
-void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
+void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
                                     int aligned_height, int aligned_width,
-                                    float spatial_scale, int sample_ratio,
+                                    float spatial_scale, int sampling_ratio,
                                     bool aligned, bool clockwise);
 
 void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                      Tensor bottom_grad, int aligned_height,
                                      int aligned_width, float spatial_scale,
-                                     int sample_ratio, bool aligned,
+                                     int sampling_ratio, bool aligned,
                                      bool clockwise);
 
 REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CPU,
                      roi_align_rotated_forward_cpu);
mmcv/ops/csrc/pytorch/cpu/rotated_feature_align.cpp (new file, 0 → 100644)

// modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

template <typename T>
T bilinear_interpolate(const T* input, const int height, const int width, T y,
                       T x, const int index /* index for debug only*/) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low = (int)y;
  int x_low = (int)x;
  int y_high;
  int x_high;

  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  T ly = y - y_low;
  T lx = x - x_low;

  // do bilinear interpolation
  T v1 = input[y_low * width + x_low];
  T v2 = input[y_low * width + x_high];
  T v3 = input[y_high * width + x_low];
  T v4 = input[y_high * width + x_high];
  const T v_low = fma(v2 - v1, lx, v1);
  const T v_high = fma(v4 - v3, lx, v3);
  const T val = fma(v_high - v_low, ly, v_low);

  return val;
}

template <typename scalar_t>
void rotated_feature_align_forward_cpu_kernel(
    const int nthreads, const int points, const scalar_t* bottom_data,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* top_data) {
  for (int index = 0; index < nthreads; index++) {
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;

    const scalar_t* bbox_offset =
        best_bboxes + ((n * height + h) * width + w) * 5;
    scalar_t roi_y = bbox_offset[0] * spatial_scale;
    scalar_t roi_x = bbox_offset[1] * spatial_scale;

    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};

    if (points > 1) {
      scalar_t roi_w = bbox_offset[2] * spatial_scale;
      scalar_t roi_h = bbox_offset[3] * spatial_scale;
      scalar_t roi_a = bbox_offset[4];

      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
      scalar_t wx = cosa * w_2, wy = sina * w_2;
      scalar_t hx = -sina * h_2, hy = cosa * h_2;

      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }

    const scalar_t* offset_bottom_data =
        bottom_data + (n * channels + c) * height * width;

    scalar_t output_val = bottom_data[index];
    for (int i = 0; i < points; i++) {
      output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
                                                   width, py[i], px[i], i);
    }
    top_data[index] = output_val;
  }
}

template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
                                   T& w1, T& w2, T& w3, T& w4, int& x_low,
                                   int& x_high, int& y_low, int& y_high,
                                   const int index) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    // empty
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  y_low = (int)y;
  x_low = (int)x;

  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (T)y_low;
  } else {
    y_high = y_low + 1;
  }

  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (T)x_low;
  } else {
    x_high = x_low + 1;
  }

  T ly = y - y_low;
  T lx = x - x_low;
  T hy = 1. - ly, hx = 1. - lx;

  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

  return;
}

template <typename scalar_t>
inline void valueAdd(scalar_t* address, scalar_t val) {
  scalar_t old = *address;
  *address = (old + val);
}

template <typename scalar_t>
void rotated_feature_align_backward_cpu_kernel(
    const int nthreads, const int points, const scalar_t* top_diff,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* bottom_diff) {
  for (int index = 0; index < nthreads; index++) {
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;

    const scalar_t* bbox_offset =
        best_bboxes + ((n * height + h) * width + w) * 5;
    scalar_t roi_y = bbox_offset[0] * spatial_scale;
    scalar_t roi_x = bbox_offset[1] * spatial_scale;

    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};

    if (points > 1) {
      scalar_t roi_w = bbox_offset[2] * spatial_scale;
      scalar_t roi_h = bbox_offset[3] * spatial_scale;
      scalar_t roi_a = bbox_offset[4];

      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
      scalar_t wx = cosa * w_2, wy = sina * w_2;
      scalar_t hx = -sina * h_2, hy = cosa * h_2;

      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }

    scalar_t* offset_bottom_diff =
        bottom_diff + (n * channels + c) * height * width;
    scalar_t value_top_diff = top_diff[index];

    valueAdd(bottom_diff + index, value_top_diff);
    for (int i = 0; i < points; i++) {
      scalar_t w1, w2, w3, w4;
      int x_low, x_high, y_low, y_high;

      bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,
                                              w2, w3, w4, x_low, x_high, y_low,
                                              y_high, i);
      scalar_t g1 = value_top_diff * w1;
      scalar_t g2 = value_top_diff * w2;
      scalar_t g3 = value_top_diff * w3;
      scalar_t g4 = value_top_diff * w4;
      if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
        valueAdd(offset_bottom_diff + y_low * width + x_low, g1);
        valueAdd(offset_bottom_diff + y_low * width + x_high, g2);
        valueAdd(offset_bottom_diff + y_high * width + x_low, g3);
        valueAdd(offset_bottom_diff + y_high * width + x_high, g4);
      }
    }
  }
}

void rotated_feature_align_forward_cpu(const Tensor features,
                                       const Tensor best_bboxes,
                                       const float spatial_scale,
                                       const int points, Tensor output) {
  const int output_size = features.numel();
  AT_DISPATCH_FLOATING_TYPES(
      features.scalar_type(), "rotated_feature_align_forward_cpu_kernel", [&] {
        const scalar_t* bottom_data = features.data_ptr<scalar_t>();
        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();
        scalar_t* top_data = output.data_ptr<scalar_t>();

        rotated_feature_align_forward_cpu_kernel<scalar_t>(
            output_size, points, bottom_data, bboxes_data,
            scalar_t(spatial_scale), features.size(1), features.size(2),
            features.size(3), top_data);
      });
}

void rotated_feature_align_backward_cpu(const Tensor top_grad,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor bottom_grad) {
  const int output_size = top_grad.numel();
  AT_DISPATCH_FLOATING_TYPES(
      top_grad.scalar_type(), "rotated_feature_align_backward_cpu_kernel",
      [&] {
        const scalar_t* top_diff = top_grad.data_ptr<scalar_t>();
        const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();
        scalar_t* bottom_diff = bottom_grad.data_ptr<scalar_t>();

        rotated_feature_align_backward_cpu_kernel<scalar_t>(
            output_size, points, top_diff, bboxes_data,
            scalar_t(spatial_scale), top_grad.size(1), top_grad.size(2),
            top_grad.size(3), bottom_diff);
      });
}

void rotated_feature_align_forward_impl(const Tensor features,
                                        const Tensor best_bboxes,
                                        const float spatial_scale,
                                        const int points, Tensor output);

void rotated_feature_align_backward_impl(const Tensor top_grad,
                                         const Tensor best_bboxes,
                                         const float spatial_scale,
                                         const int points, Tensor bottom_grad);

REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CPU,
                     rotated_feature_align_forward_cpu);
REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CPU,
                     rotated_feature_align_backward_cpu);
mmcv/ops/csrc/pytorch/cpu/sparse_indice.cpp (new file, 0 → 100644)

// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <torch/script.h>
#include <utils/spconv/spconv/geometry.h>
#include <utils/spconv/spconv/indice.h>

#include "pytorch_cpp_helper.hpp"

namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    if (transpose)
      return getIndicePairsDeConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
    else
      return getIndicePairsConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    return getIndicePairsSubM<Index, IndexGrid, NDim>(
        indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data());
  }
};
}  // namespace functor

#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM)                           \
  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
                                                       NDIM>;               \
  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
                                                       NDIM>;

#define DECLARE_CPU_INDEX(Index)          \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);

DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);

#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM
mmcv/ops/csrc/pytorch/cpu/sparse_maxpool.cpp (new file, 0 → 100644)

// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <torch/script.h>
#include <utils/spconv/spconv/maxpool.h>

#include "pytorch_cpp_helper.hpp"

namespace functor {
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, Index> {
  void operator()(const tv::CPU& d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    int stride = outFeatures.dim(1);
    auto outFeaturesData = outFeatures.data();
    auto inFeaturesData = inFeatures.data();
    auto indicesIn = indices.subview(0).data();
    auto indicesOut = indices.subview(1).data();
    Index idxi, idxo;
    for (int row = 0; row < size; row++) {
      idxi = indicesIn[row] * stride;
      idxo = indicesOut[row] * stride;
      for (int plane = 0; plane < stride; ++plane)
        if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
          outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
    }
  }
};

template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, Index> {
  void operator()(const tv::CPU& d,
                  tv::TensorView<const scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> inFeatures,
                  tv::TensorView<const scalar_t> fout,
                  tv::TensorView<scalar_t> fin,
                  tv::TensorView<const Index> indices, int size) {
    int stride = outFeatures.dim(1);
    auto outFeaturesData = outFeatures.data();
    auto inFeaturesData = inFeatures.data();
    auto foutData = fout.data();
    auto finData = fin.data();
    auto indicesIn = indices.subview(0).data();
    auto indicesOut = indices.subview(1).data();
    Index idxi, idxo;
    for (int row = 0; row < size; row++) {
      idxi = indicesIn[row] * stride;
      idxo = indicesOut[row] * stride;
      for (int plane = 0; plane < stride; ++plane)
        if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
          finData[idxi + plane] += foutData[idxo + plane];
    }
  }
};
}  // namespace functor

#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                                \
  template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
  template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;

#define DECLARE_CPU_SPECS(T)         \
  DECLARE_CPU_SPECS_T_INDEX(T, int); \
  DECLARE_CPU_SPECS_T_INDEX(T, long);

DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);

#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
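Note the gradient-routing rule in the backward functor above: a gradient flows to an input location exactly when its value equals the pooled output (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane]). When several inputs tie for the maximum, each of them receives the full output gradient, unlike dense max-pooling implementations that credit a single argmax; for continuous-valued features exact ties are rare, so in practice the two behaviors coincide.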
mmcv/ops/csrc/pytorch/cpu/sparse_reordering.cpp (new file, 0 → 100644)

// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <torch/script.h>
#include <utils/spconv/spconv/reordering.h>

#include "pytorch_cpp_helper.hpp"

namespace functor {
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::CPU, scalar_t, Index> {
  void operator()(const tv::CPU& d, tv::TensorView<scalar_t> buffer,
                  tv::TensorView<const scalar_t> features,
                  tv::TensorView<const Index> indices, int size) {
    int numPlanes = features.dim(1);
    for (int i = 0; i < size; ++i) {
      std::memcpy(buffer.data() + i * numPlanes,
                  features.data() + indices[i] * numPlanes,
                  sizeof(scalar_t) * numPlanes);
    }
  }
};

template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::CPU, scalar_t, Index> {
  void operator()(const tv::CPU& d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> buffer,
                  tv::TensorView<const Index> indices, int size, bool stable) {
    int numPlanes = outFeatures.dim(1);
    const scalar_t* buf = buffer.data();
    scalar_t* out = outFeatures.data();
    for (int i = 0; i < size; ++i) {
      buf = buffer.data() + i * numPlanes;
      out = outFeatures.data() + indices[i] * numPlanes;
      for (int j = 0; j < numPlanes; ++j) {
        out[j] += buf[j];
      }
    }
  }
};
}  // namespace functor

#define DECLARE_CPU_SPECS_T_INDEX(scalar_t, Index)                         \
  template struct functor::SparseGatherFunctor<tv::CPU, scalar_t, Index>;  \
  template struct functor::SparseScatterAddFunctor<tv::CPU, scalar_t, Index>;

#define DECLARE_CPU_SPECS(scalar_t)         \
  DECLARE_CPU_SPECS_T_INDEX(scalar_t, int); \
  DECLARE_CPU_SPECS_T_INDEX(scalar_t, long);

DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);

#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
mmcv/ops/csrc/pytorch/cpu/voxelization.cpp

@@ -26,13 +26,22 @@ void dynamic_voxelize_forward_cpu_kernel(
       coor[ndim_minus_1 - j] = c;
     }
 
-    if (failed)
-      memset(&coors[i][0], -1, NDim * sizeof(T_int));
-    else
-      memcpy(&coors[i][0], &coor[0], NDim * sizeof(T_int));
+    // memcpy and memset will cause problem because of the memory distribution
+    // discontinuity of TensorAccessor, so here using loops to replace memcpy
+    // or memset
+    if (failed) {
+      for (int k = 0; k < NDim; ++k) {
+        coors[i][k] = -1;
+      }
+    } else {
+      for (int k = 0; k < NDim; ++k) {
+        coors[i][k] = coor[k];
+      }
+    }
   }
 
   delete[] coor;
   return;
 }
 
 template <typename T, typename T_int>

@@ -72,14 +81,21 @@ void hard_voxelize_forward_cpu_kernel(
       voxel_num += 1;
       coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
-      memcpy(&coors[voxelidx][0], &coor[i][0], NDim * sizeof(T_int));
+      // memcpy will cause problem because of the memory distribution
+      // discontinuity of TensorAccessor, so here using loops to replace memcpy
+      for (int k = 0; k < NDim; ++k) {
+        coors[voxelidx][k] = coor[i][k];
+      }
     }
 
     // put points into voxel
     num = num_points_per_voxel[voxelidx];
     if (max_points == -1 || num < max_points) {
-      memcpy(&voxels[voxelidx][num][0], &points[i][0],
-             num_features * sizeof(T));
+      // memcpy will cause problem because of the memory distribution
+      // discontinuity of TensorAccessor, so here using loops to replace memcpy
+      for (int k = 0; k < num_features; ++k) {
+        voxels[voxelidx][num][k] = points[i][k];
+      }
       num_points_per_voxel[voxelidx] += 1;
     }
   }
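The comments in this hunk are worth unpacking: coors here is indexed through a TensorAccessor, which applies the tensor's strides on every subscript, so memcpy(&coors[i][0], ...) is only valid when the row happens to be contiguous in memory, while element-wise stores through the accessor are always correct. A minimal sketch of the replacement pattern (assuming a CPU int tensor accessed via Tensor::accessor<int, 2>; the write_row helper is illustrative, not part of this commit):

#include <ATen/ATen.h>

// Stride-aware row write: each coors[i][k] store is resolved through the
// accessor, so it works even when the underlying tensor is a non-contiguous
// view. A raw memcpy on &coors[i][0] would silently assume row-contiguity.
void write_row(at::TensorAccessor<int, 2> coors, int i, const int* src,
               int ndim) {
  for (int k = 0; k < ndim; ++k) {
    coors[i][k] = src[k];
  }
}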
mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.cu (new file, 0 → 100644)

// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#include "active_rotated_filter_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,
                                                  const Tensor indices,
                                                  Tensor output) {
  int num_output_planes = input.size(0);
  int num_input_planes = input.size(1);
  int num_orientations = input.size(2);
  int kH = input.size(3);
  int kW = input.size(4);
  int num_rotations = indices.size(3);
  int nEntry = num_orientations * kH * kW;
  int output_size = input.numel();

  at::cuda::CUDAGuard device_guard(input.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "active_rotated_filter_forward_cuda_kernel", [&] {
        active_rotated_filter_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, input.data_ptr<scalar_t>(),
                indices.data_ptr<int>(), num_input_planes, num_output_planes,
                num_orientations, num_rotations, nEntry,
                output.data_ptr<scalar_t>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
}

void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,
                                                   const Tensor indices,
                                                   Tensor grad_in) {
  int num_orientations = indices.size(0);
  int kH = indices.size(1);
  int kW = indices.size(2);
  int num_rotations = indices.size(3);
  int num_output_planes = grad_out.size(0) / num_rotations;
  int num_input_planes = grad_out.size(1) / num_orientations;
  int nEntry = num_orientations * kH * kW;
  int output_size = grad_in.numel();

  at::cuda::CUDAGuard device_guard(indices.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_out.scalar_type(), "active_rotated_filter_backward_cuda_kernel",
      [&] {
        active_rotated_filter_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                output_size, grad_out.data_ptr<scalar_t>(),
                indices.data_ptr<int>(), num_input_planes, num_output_planes,
                num_orientations, num_rotations, nEntry,
                grad_in.data_ptr<scalar_t>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
}
mmcv/ops/csrc/pytorch/cuda/assign_score_withk_cuda.cu

@@ -13,7 +13,7 @@ void AssignScoreWithKForwardCUDAKernelLauncher(
   at::cuda::CUDAGuard device_guard(points.device());
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  dim3 blocks(DIVUP(B * O * N1 * K, THREADS_PER_BLOCK));
+  dim3 blocks(GET_BLOCKS(B * O * N1 * K, THREADS_PER_BLOCK));
   dim3 threads(THREADS_PER_BLOCK);
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(

@@ -36,9 +36,9 @@ void AssignScoreWithKBackwardCUDAKernelLauncher(
   at::cuda::CUDAGuard device_guard(grad_out.device());
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  dim3 blocks1(DIVUP(B * M * O, THREADS_PER_BLOCK));
+  dim3 blocks1(GET_BLOCKS(B * M * O, THREADS_PER_BLOCK));
   dim3 threads1(THREADS_PER_BLOCK);
-  dim3 blocks2(DIVUP(B * N1 * K * M, THREADS_PER_BLOCK));
+  dim3 blocks2(GET_BLOCKS(B * N1 * K * M, THREADS_PER_BLOCK));
   dim3 threads2(THREADS_PER_BLOCK);
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.cu

@@ -22,7 +22,7 @@ void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);
+  dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
   dim3 threads(THREADS_PER_BLOCK);
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
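Both this file and assign_score_withk_cuda.cu swap DIVUP for GET_BLOCKS when sizing the launch grid. Beyond plain ceil-division, mmcv's helper also clamps the block count, which is the practical reason for the change; a sketch of the helper's typical shape (the exact cap value is an assumption here; see common_cuda_helper.hpp for the real definition):

#include <algorithm>

// Ceil-division with an upper bound on the grid size. Kernels written with a
// grid-stride index loop still cover all N elements when the grid is clamped.
inline int get_blocks_sketch(const int N, const int num_threads) {
  const int optimal_block_num = (N + num_threads - 1) / num_threads;
  const int max_block_num = 4096;  // assumed cap, for illustration
  return std::min(optimal_block_num, max_block_num);
}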
mmcv/ops/csrc/pytorch/cuda/bbox_overlaps_cuda.cu

@@ -2,6 +2,22 @@
 #include "bbox_overlaps_cuda_kernel.cuh"
 #include "pytorch_cuda_helper.hpp"
 
+// Disable fp16 on ROCm device
+#ifndef HIP_DIFF
+#if __CUDA_ARCH__ >= 530
+template <>
+__global__ void bbox_overlaps_cuda_kernel<at::Half>(
+    const at::Half* bbox1, const at::Half* bbox2, at::Half* ious,
+    const int num_bbox1, const int num_bbox2, const int mode,
+    const bool aligned, const int offset) {
+  bbox_overlaps_cuda_kernel_half(reinterpret_cast<const __half*>(bbox1),
+                                 reinterpret_cast<const __half*>(bbox2),
+                                 reinterpret_cast<__half*>(ious), num_bbox1,
+                                 num_bbox2, mode, aligned, offset);
+}
+#endif  // __CUDA_ARCH__ >= 530
+#endif  // HIP_DIFF
+
 void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
                                     Tensor ious, const int mode,
                                     const bool aligned, const int offset) {
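Context for the guards in this hunk: native __half arithmetic requires compute capability 5.3 or higher, so the at::Half specialization is compiled only when __CUDA_ARCH__ >= 530, and the outer #ifndef HIP_DIFF removes it entirely on ROCm builds, where this fp16 path is disabled, as the leading comment states.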
mmcv/ops/csrc/pytorch/cuda/chamfer_distance_cuda.cu (new file, 0 → 100644)

// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp
#include "chamfer_distance_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void ChamferDistanceForwardCUDAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
    const Tensor dist2, const Tensor idx1, const Tensor idx2) {
  int batch_size = xyz1.size(0);
  int n = xyz1.size(1);
  int m = xyz2.size(1);

  at::cuda::CUDAGuard device_guard(xyz1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz1.scalar_type(), "chamfer_distance_forward_cuda_kernel", [&] {
        chamfer_distance_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK, 0, stream>>>(
                batch_size, n, xyz1.data_ptr<scalar_t>(), m,
                xyz2.data_ptr<scalar_t>(), dist1.data_ptr<scalar_t>(),
                idx1.data_ptr<int>());
      });
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz1.scalar_type(), "chamfer_distance_forward_cuda_kernel", [&] {
        chamfer_distance_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK, 0, stream>>>(
                batch_size, m, xyz2.data_ptr<scalar_t>(), n,
                xyz1.data_ptr<scalar_t>(), dist2.data_ptr<scalar_t>(),
                idx2.data_ptr<int>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
}

void ChamferDistanceBackwardCUDAKernelLauncher(
    const Tensor xyz1, const Tensor xyz2, Tensor grad_xyz1, Tensor grad_xyz2,
    Tensor grad_dist1, Tensor grad_dist2, Tensor idx1, Tensor idx2) {
  int batch_size = xyz1.size(0);
  int n = xyz1.size(1);
  int m = xyz2.size(1);

  at::cuda::CUDAGuard device_guard(xyz1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz1.scalar_type(), "chamfer_distance_backward_cuda_kernel", [&] {
        chamfer_distance_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK / 2, 0,
               stream>>>(
                batch_size, m, xyz1.data_ptr<scalar_t>(), n,
                xyz2.data_ptr<scalar_t>(), grad_dist1.data_ptr<scalar_t>(),
                idx1.data_ptr<int>(), grad_xyz1.data_ptr<scalar_t>(),
                grad_xyz2.data_ptr<scalar_t>());
      });
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      xyz1.scalar_type(), "chamfer_distance_backward_cuda_kernel", [&] {
        chamfer_distance_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK / 2, 0,
               stream>>>(
                batch_size, n, xyz2.data_ptr<scalar_t>(), m,
                xyz1.data_ptr<scalar_t>(), grad_dist2.data_ptr<scalar_t>(),
                idx2.data_ptr<int>(), grad_xyz2.data_ptr<scalar_t>(),
                grad_xyz1.data_ptr<scalar_t>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
}
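The two back-to-back dispatches in each launcher are not a copy-paste accident: the same kernel computes one directed nearest-neighbor term, and swapping (xyz1, n) with (xyz2, m) yields the other direction. Assuming the usual squared-distance convention for this op, the output buffers hold

$\mathrm{dist1}[b,i] = \min_{j} \lVert \mathrm{xyz1}[b,i] - \mathrm{xyz2}[b,j] \rVert_2^2, \qquad \mathrm{dist2}[b,j] = \min_{i} \lVert \mathrm{xyz2}[b,j] - \mathrm{xyz1}[b,i] \rVert_2^2,$

with idx1/idx2 recording the argmin indices that the backward launcher reuses to route gradients to both point sets.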
mmcv/ops/csrc/pytorch/cuda/convex_iou.cu (new file, 0 → 100644)

// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/iou/src/convex_iou_kernel.cu
#include "convex_iou_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
                                 Tensor ious) {
  int output_size = ious.numel();
  int num_pointsets = pointsets.size(0);
  int num_polygons = polygons.size(0);

  at::cuda::CUDAGuard device_guard(pointsets.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      pointsets.scalar_type(), "convex_iou_cuda_kernel", ([&] {
        convex_iou_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK / 2, 0, stream>>>(
                num_pointsets, num_polygons, pointsets.data_ptr<scalar_t>(),
                polygons.data_ptr<scalar_t>(), ious.data_ptr<scalar_t>());
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}

void ConvexGIoUCUDAKernelLauncher(const Tensor pointsets,
                                  const Tensor polygons, Tensor output) {
  int output_size = output.numel();
  int num_pointsets = pointsets.size(0);
  int num_polygons = polygons.size(0);

  at::cuda::CUDAGuard device_guard(pointsets.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      pointsets.scalar_type(), "convex_giou_cuda_kernel", ([&] {
        convex_giou_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK / 2, 0, stream>>>(
                num_pointsets, num_polygons, pointsets.data_ptr<scalar_t>(),
                polygons.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}
mmcv/ops/csrc/pytorch/cuda/correlation_cuda.cu

@@ -24,8 +24,8 @@ void CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2,
   auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
   auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();
 
-  const int threads = THREADS_FORWARD;
-  const dim3 blocks(batch_size, oH, oW);
+  const dim3 threads(WARP_SIZE, 4, 4);
+  const dim3 blocks(batch_size, (oH + 3) >> 2, (oW + 3) >> 2);
 
   at::cuda::CUDAGuard device_guard(input1.device());

@@ -56,17 +56,20 @@ void CorrelationBackwardCUDAKernelLauncher(
   const int iW = input1.size(3);
   const int C = input1.size(1);
 
-  const dim3 blocks(C, iH, iW);
-  const dim3 threads(THREADS_BACKWARD, THREADS_BACKWARD);
+  auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
+  auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();
+  const dim3 blocks(batch_size, iH, iW);
+  const dim3 threads(THREADS_PER_BLOCK);
   at::cuda::CUDAGuard device_guard(input1.device());
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       input1.scalar_type(), "correlation_backward_cuda", ([&] {
-        const int grad_cache_size = patchH * patchW * sizeof(scalar_t);
         TensorAcc4R input1_acc =
-            input1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
+            trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
         TensorAcc4R input2_acc =
-            input2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
+            trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
         TensorAcc4R grad_input1_acc =
             grad_input1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
         TensorAcc4R grad_input2_acc =

@@ -74,20 +77,18 @@ void CorrelationBackwardCUDAKernelLauncher(
         TensorAcc5R grad_output_acc =
             grad_output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();
 
-        for (int n = 0; n < batch_size; ++n) {
-          correlation_backward_cuda_kernel_input1<scalar_t>
-              <<<blocks, threads, grad_cache_size,
-                 at::cuda::getCurrentCUDAStream()>>>(
-                  grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH,
-                  patchW, padH, padW, dilationH, dilationW, dilation_patchH,
-                  dilation_patchW, dH, dW, n);
-        }
-
-        for (int n = 0; n < batch_size; ++n) {
-          correlation_backward_cuda_kernel_input2<scalar_t>
-              <<<blocks, threads, grad_cache_size,
-                 at::cuda::getCurrentCUDAStream()>>>(
-                  grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH,
-                  patchW, padH, padW, dilationH, dilationW, dilation_patchH,
-                  dilation_patchW, dH, dW, n);
-        }
+        correlation_backward_cuda_kernel_input1<scalar_t>
+            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+                grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH,
+                patchW, padH, padW, dilationH, dilationW, dilation_patchH,
+                dilation_patchW, dH, dW);
+
+        correlation_backward_cuda_kernel_input2<scalar_t>
+            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+                grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH,
+                patchW, padH, padW, dilationH, dilationW, dilation_patchH,
+                dilation_patchW, dH, dW);
       }));
 }
mmcv/ops/csrc/pytorch/cuda/cudabind.cpp
View file @
fdeee889
...
@@ -570,20 +570,15 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
                                                    const Tensor boxes_b,
                                                    Tensor ans_overlap);
-void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
-                                               const Tensor boxes_a,
-                                               const int num_b,
-                                               const Tensor boxes_b,
-                                               Tensor ans_iou);
-void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
-                                       unsigned long long *mask, int boxes_num,
-                                       float nms_overlap_thresh);
-void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
-                                             unsigned long long *mask,
-                                             int boxes_num,
-                                             float nms_overlap_thresh);
+void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes,
+                                         unsigned long long *mask,
+                                         int boxes_num,
+                                         float nms_overlap_thresh);
+void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes,
+                                               unsigned long long *mask,
+                                               int boxes_num,
+                                               float nms_overlap_thresh);
 void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
                                           const int num_b, const Tensor boxes_b,
...
@@ -592,45 +587,35 @@ void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
                                           ans_overlap);
 };
-void iou3d_boxes_iou_bev_forward_cuda(const int num_a, const Tensor boxes_a,
-                                      const int num_b, const Tensor boxes_b,
-                                      Tensor ans_iou) {
-  IoU3DBoxesIoUBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
-                                            ans_iou);
-};
-void iou3d_nms_forward_cuda(const Tensor boxes, unsigned long long *mask,
-                            int boxes_num, float nms_overlap_thresh) {
-  IoU3DNMSForwardCUDAKernelLauncher(boxes, mask, boxes_num,
-                                    nms_overlap_thresh);
-};
-void iou3d_nms_normal_forward_cuda(const Tensor boxes, unsigned long long *mask,
-                                   int boxes_num, float nms_overlap_thresh) {
-  IoU3DNMSNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
-                                          nms_overlap_thresh);
-};
+void iou3d_nms3d_forward_cuda(const Tensor boxes, unsigned long long *mask,
+                              int boxes_num, float nms_overlap_thresh) {
+  IoU3DNMS3DForwardCUDAKernelLauncher(boxes, mask, boxes_num,
+                                      nms_overlap_thresh);
+};
+void iou3d_nms3d_normal_forward_cuda(const Tensor boxes,
+                                     unsigned long long *mask,
+                                     int boxes_num,
+                                     float nms_overlap_thresh) {
+  IoU3DNMS3DNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
+                                            nms_overlap_thresh);
+};
 void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                           const int num_b, const Tensor boxes_b,
                                           Tensor ans_overlap);
-void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
-                                      const int num_b, const Tensor boxes_b,
-                                      Tensor ans_iou);
-void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
-                            int boxes_num, float nms_overlap_thresh);
-void iou3d_nms_normal_forward_impl(const Tensor boxes,
-                                   unsigned long long *mask,
-                                   int boxes_num, float nms_overlap_thresh);
+void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long *mask,
+                              int boxes_num, float nms_overlap_thresh);
+void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
+                                     unsigned long long *mask, int boxes_num,
+                                     float nms_overlap_thresh);
 REGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, CUDA,
                      iou3d_boxes_overlap_bev_forward_cuda);
-REGISTER_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, CUDA,
-                     iou3d_boxes_iou_bev_forward_cuda);
-REGISTER_DEVICE_IMPL(iou3d_nms_forward_impl, CUDA, iou3d_nms_forward_cuda);
-REGISTER_DEVICE_IMPL(iou3d_nms_normal_forward_impl, CUDA,
-                     iou3d_nms_normal_forward_cuda);
+REGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, CUDA, iou3d_nms3d_forward_cuda);
+REGISTER_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, CUDA,
                     iou3d_nms3d_normal_forward_cuda);
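Note: every binding in this file goes through the same REGISTER_DEVICE_IMPL macro, which ties a device-agnostic `*_impl` symbol to a concrete backend at static-initialization time. A simplified C++ sketch of how such a registry can work (an illustration under assumptions; this is not mmcv's actual macro, and real entries would keep each function's own signature rather than `void()`):

#include <functional>
#include <map>
#include <string>
#include <utility>

using Impl = std::function<void()>;  // simplified; real impls keep their signatures

std::map<std::pair<std::string, std::string>, Impl>& registry() {
  static std::map<std::pair<std::string, std::string>, Impl> r;
  return r;
}

struct Registrar {
  Registrar(std::string key, std::string device, Impl fn) {
    registry()[{std::move(key), std::move(device)}] = std::move(fn);
  }
};

// A macro like this lets each backend file self-register its kernels, so the
// generic entry point can look up ("iou3d_nms3d_forward_impl", "CUDA") at
// runtime and forward to the CUDA wrapper.
#define MY_REGISTER_DEVICE_IMPL(key, device, fn) \
  static Registrar registrar_##key##_##device(#key, #device, fn)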
 void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
                                   const Tensor xyz, const Tensor new_xyz,
...
@@ -924,20 +909,20 @@ REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);
 REGISTER_DEVICE_IMPL(roi_align_backward_impl, CUDA, roi_align_backward_cuda);
 void ROIAlignRotatedForwardCUDAKernelLauncher(
-    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
-    const int sample_num, const bool aligned, const bool clockwise,
+    const at::Tensor input, const at::Tensor rois, const float spatial_scale,
+    const int sampling_ratio, const bool aligned, const bool clockwise,
     const int channels, const int height, const int width, const int num_rois,
     const int pooled_height, const int pooled_width, at::Tensor output);
 void ROIAlignRotatedBackwardCUDAKernelLauncher(
     const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
-    const int sample_num, const bool aligned, const bool clockwise,
+    const int sampling_ratio, const bool aligned, const bool clockwise,
     const int channels, const int height, const int width, const int num_rois,
     const int pooled_height, const int pooled_width, at::Tensor bottom_grad);
-void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
+void roi_align_rotated_forward_cuda(Tensor input, Tensor rois, Tensor output,
                                     int aligned_height, int aligned_width,
-                                    float spatial_scale, int sample_ratio,
+                                    float spatial_scale, int sampling_ratio,
                                     bool aligned, bool clockwise) {
   // Number of ROIs
   int num_rois = rois.size(0);
...
@@ -947,11 +932,11 @@ void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
     AT_ERROR("wrong roi size");
   }
-  int num_channels = features.size(1);
-  int data_height = features.size(2);
-  int data_width = features.size(3);
+  int num_channels = input.size(1);
+  int data_height = input.size(2);
+  int data_width = input.size(3);
   ROIAlignRotatedForwardCUDAKernelLauncher(
-      features, rois, spatial_scale, sample_ratio, aligned, clockwise,
+      input, rois, spatial_scale, sampling_ratio, aligned, clockwise,
       num_channels, data_height, data_width, num_rois, aligned_height,
       aligned_width, output);
 }
...
@@ -959,7 +944,7 @@ void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
 void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
                                      Tensor bottom_grad, int aligned_height,
                                      int aligned_width, float spatial_scale,
-                                     int sample_ratio, bool aligned,
+                                     int sampling_ratio, bool aligned,
                                      bool clockwise) {
   // Number of ROIs
   int num_rois = rois.size(0);
...
@@ -972,26 +957,101 @@ void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
   int data_height = bottom_grad.size(2);
   int data_width = bottom_grad.size(3);
   ROIAlignRotatedBackwardCUDAKernelLauncher(
-      top_grad, rois, spatial_scale, sample_ratio, aligned, clockwise,
+      top_grad, rois, spatial_scale, sampling_ratio, aligned, clockwise,
       num_channels, data_height, data_width, num_rois, aligned_height,
       aligned_width, bottom_grad);
 }
-void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
+void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
                                     int aligned_height, int aligned_width,
-                                    float spatial_scale, int sample_ratio,
+                                    float spatial_scale, int sampling_ratio,
                                     bool aligned, bool clockwise);
 void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                      Tensor bottom_grad, int aligned_height,
                                      int aligned_width, float spatial_scale,
-                                     int sample_ratio, bool aligned,
+                                     int sampling_ratio, bool aligned,
                                      bool clockwise);
 REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CUDA,
                      roi_align_rotated_forward_cuda);
 REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CUDA,
                      roi_align_rotated_backward_cuda);
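Note: this hunk is a pure rename (`features` to `input`, `sample_num`/`sample_ratio` to `sampling_ratio`) that aligns the rotated variant with the plain `roi_align` interface. Under the usual RoIAlign convention (stated here as an assumption for context, not shown in this diff), a positive `sampling_ratio` fixes the number of bilinear samples per pooled bin, while a non-positive value derives it adaptively from the RoI size:

#include <cmath>

// Illustrative helper, not part of mmcv: number of sample points one pooled
// bin gets along one axis under the common RoIAlign convention.
inline int bin_grid_size(int sampling_ratio, float roi_extent,
                         int pooled_extent) {
  return sampling_ratio > 0
             ? sampling_ratio
             : static_cast<int>(std::ceil(roi_extent / pooled_extent));
}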
+void RiROIAlignRotatedForwardCUDAKernelLauncher(
+    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
+    const int num_samples, const bool clockwise, const int channels,
+    const int height, const int width, const int num_rois,
+    const int pooled_height, const int pooled_width,
+    const int num_orientations, at::Tensor output);
+
+void RiROIAlignRotatedBackwardCUDAKernelLauncher(
+    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
+    const int num_samples, const bool clockwise, const int channels,
+    const int height, const int width, const int num_rois,
+    const int pooled_height, const int pooled_width,
+    const int num_orientations, at::Tensor bottom_grad);
+
+void riroi_align_rotated_forward_cuda(Tensor features, Tensor rois,
+                                      Tensor output, int pooled_height,
+                                      int pooled_width, float spatial_scale,
+                                      int num_samples, int num_orientations,
+                                      bool clockwise) {
+  // Number of ROIs
+  int num_rois = rois.size(0);
+  int size_rois = rois.size(1);
+  if (size_rois != 6) {
+    AT_ERROR("wrong roi size");
+  }
+  CHECK_CONTIGUOUS(features);
+  CHECK_CONTIGUOUS(rois);
+  int num_channels = features.size(1) / num_orientations;
+  int data_height = features.size(2);
+  int data_width = features.size(3);
+  RiROIAlignRotatedForwardCUDAKernelLauncher(
+      features, rois, spatial_scale, num_samples, clockwise, num_channels,
+      data_height, data_width, num_rois, pooled_height, pooled_width,
+      num_orientations, output);
+}
+
+void riroi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
+                                       Tensor bottom_grad, int pooled_height,
+                                       int pooled_width, float spatial_scale,
+                                       int num_samples, int num_orientations,
+                                       bool clockwise) {
+  // Number of ROIs
+  int num_rois = rois.size(0);
+  int size_rois = rois.size(1);
+  if (size_rois != 6) {
+    AT_ERROR("wrong roi size");
+  }
+  CHECK_CONTIGUOUS(top_grad);
+  CHECK_CONTIGUOUS(rois);
+  int num_channels = bottom_grad.size(1) / num_orientations;
+  int data_height = bottom_grad.size(2);
+  int data_width = bottom_grad.size(3);
+  RiROIAlignRotatedBackwardCUDAKernelLauncher(
+      top_grad, rois, spatial_scale, num_samples, clockwise, num_channels,
+      data_height, data_width, num_rois, pooled_height, pooled_width,
+      num_orientations, bottom_grad);
+}
+
+void riroi_align_rotated_forward_impl(Tensor features, Tensor rois,
+                                      Tensor output, int pooled_height,
+                                      int pooled_width, float spatial_scale,
+                                      int num_samples, int num_orientations,
+                                      bool clockwise);
+
+void riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
+                                       Tensor bottom_grad, int pooled_height,
+                                       int pooled_width, float spatial_scale,
+                                       int num_samples, int num_orientations,
+                                       bool clockwise);
+
+REGISTER_DEVICE_IMPL(riroi_align_rotated_forward_impl, CUDA,
+                     riroi_align_rotated_forward_cuda);
+REGISTER_DEVICE_IMPL(riroi_align_rotated_backward_impl, CUDA,
+                     riroi_align_rotated_backward_cuda);
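Note: unlike the plain rotated variant, `riroi_align_rotated_*` divides the channel count by `num_orientations`, so the input feature map is expected to pack `num_orientations` copies of each channel (the layout produced by orientation-aware features). A caller-side sanity check makes the assumed layout explicit (illustrative only, not mmcv code):

#include <cassert>

// Assumed layout: [N, C * num_orientations, H, W]; e.g. 256 total channels
// with 8 orientations give 32 channels per orientation.
inline int channels_per_orientation(int total_channels, int num_orientations) {
  assert(num_orientations > 0 && total_channels % num_orientations == 0);
  return total_channels / num_orientations;
}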
 void RoiawarePool3dForwardCUDAKernelLauncher(
     int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
     int out_y, int out_z, const Tensor rois, const Tensor pts,
...
@@ -1321,6 +1381,12 @@ int HardVoxelizeForwardCUDAKernelLauncher(
     const std::vector<float> coors_range, const int max_points,
     const int max_voxels, const int NDim = 3);
+int NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
+    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
+    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
+    const std::vector<float> coors_range, const int max_points,
+    const int max_voxels, const int NDim = 3);
 void DynamicVoxelizeForwardCUDAKernelLauncher(
     const at::Tensor &points, at::Tensor &coors,
     const std::vector<float> voxel_size, const std::vector<float> coors_range,
...
@@ -1338,6 +1404,16 @@ int hard_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& voxels,
                                 max_points, max_voxels, NDim);
 };
+int nondeterministic_hard_voxelize_forward_cuda(
+    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
+    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
+    const std::vector<float> coors_range, const int max_points,
+    const int max_voxels, const int NDim) {
+  return NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
+      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
+      max_points, max_voxels, NDim);
+};
 void dynamic_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& coors,
                                    const std::vector<float> voxel_size,
                                    const std::vector<float> coors_range,
...
@@ -1354,11 +1430,361 @@ int hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels,
                                const int max_points, const int max_voxels,
                                const int NDim);
+int nondeterministic_hard_voxelize_forward_impl(
+    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
+    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
+    const std::vector<float> coors_range, const int max_points,
+    const int max_voxels, const int NDim);
 void dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors,
                                    const std::vector<float> voxel_size,
                                    const std::vector<float> coors_range,
                                    const int NDim);
 REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CUDA,
                      hard_voxelize_forward_cuda);
+REGISTER_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl, CUDA,
+                     nondeterministic_hard_voxelize_forward_cuda);
 REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CUDA,
                      dynamic_voxelize_forward_cuda);
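Note: the new nondeterministic hard-voxelize path shares the same launcher signature as the deterministic one; the variants differ in how voxels are assigned to output slots, not in the grid arithmetic itself. That shared mapping bins each point by subtracting the range minimum and dividing by the voxel size, rejecting points outside `coors_range`. A per-axis sketch of that index computation (an illustration, not the kernel code):

#include <cmath>

// Maps one coordinate to its integer voxel index, or -1 if the point falls
// outside [range_min, range_max) or the resulting cell is out of the grid.
inline int voxel_coord(float p, float range_min, float range_max,
                       float voxel_size, int grid_size) {
  int c = static_cast<int>(std::floor((p - range_min) / voxel_size));
  return (p < range_min || p >= range_max || c < 0 || c >= grid_size) ? -1 : c;
}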
+void RotatedFeatureAlignForwardCUDAKernelLauncher(const Tensor features,
+                                                  const Tensor best_bboxes,
+                                                  const float spatial_scale,
+                                                  const int points,
+                                                  Tensor output);
+
+void RotatedFeatureAlignBackwardCUDAKernelLauncher(const Tensor top_grad,
+                                                   const Tensor best_bboxes,
+                                                   const float spatial_scale,
+                                                   const int points,
+                                                   Tensor bottom_grad);
+
+void rotated_feature_align_forward_cuda(const Tensor features,
+                                        const Tensor best_bboxes,
+                                        const float spatial_scale,
+                                        const int points, Tensor output) {
+  RotatedFeatureAlignForwardCUDAKernelLauncher(features, best_bboxes,
+                                               spatial_scale, points, output);
+};
+
+void rotated_feature_align_backward_cuda(const Tensor top_grad,
+                                         const Tensor best_bboxes,
+                                         const float spatial_scale,
+                                         const int points,
+                                         Tensor bottom_grad) {
+  RotatedFeatureAlignBackwardCUDAKernelLauncher(top_grad, best_bboxes,
+                                                spatial_scale, points,
+                                                bottom_grad);
+};
+
+void rotated_feature_align_forward_impl(const Tensor features,
+                                        const Tensor best_bboxes,
+                                        const float spatial_scale,
+                                        const int points, Tensor output);
+
+void rotated_feature_align_backward_impl(const Tensor top_grad,
+                                         const Tensor best_bboxes,
+                                         const float spatial_scale,
+                                         const int points, Tensor bottom_grad);
+
+REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CUDA,
+                     rotated_feature_align_forward_cuda);
+REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CUDA,
+                     rotated_feature_align_backward_cuda);
+void PointsInPolygonsForwardCUDAKernelLauncher(const at::Tensor points,
+                                               const at::Tensor polygons,
+                                               const int rows, const int cols,
+                                               at::Tensor output);
+
+void points_in_polygons_forward_cuda(const Tensor points,
+                                     const Tensor polygons, Tensor output,
+                                     const int rows, const int cols) {
+  PointsInPolygonsForwardCUDAKernelLauncher(points, polygons, rows, cols,
+                                            output);
+};
+
+void points_in_polygons_forward_impl(const Tensor points,
+                                     const Tensor polygons, Tensor output,
+                                     const int rows, const int cols);
+
+REGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, CUDA,
+                     points_in_polygons_forward_cuda);
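Note: `points_in_polygons` tests each of `rows` points against each of `cols` polygons. One standard way to implement such a test is even-odd ray casting; the sketch below is that textbook algorithm (an illustration, not necessarily the exact method used by the CUDA kernel):

// Counts how many polygon edges a horizontal ray from (px, py) crosses;
// an odd count means the point is inside. xs/ys hold the n vertices.
bool point_in_polygon(const float* xs, const float* ys, int n, float px,
                      float py) {
  bool inside = false;
  for (int i = 0, j = n - 1; i < n; j = i++) {
    bool crosses = (ys[i] > py) != (ys[j] > py);
    if (crosses &&
        px < (xs[j] - xs[i]) * (py - ys[i]) / (ys[j] - ys[i]) + xs[i]) {
      inside = !inside;
    }
  }
  return inside;
}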
+torch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,
+                                                     torch::Tensor indicePairs,
+                                                     torch::Tensor indiceNum,
+                                                     int64_t numAct);
+
+torch::Tensor indice_maxpool_forward_cuda(torch::Tensor features,
+                                          torch::Tensor indicePairs,
+                                          torch::Tensor indiceNum,
+                                          int64_t numAct) {
+  return IndiceMaxpoolForwardCUDAKernelLauncher(features, indicePairs,
+                                                indiceNum, numAct);
+};
+
+torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
+                                          torch::Tensor indicePairs,
+                                          torch::Tensor indiceNum,
+                                          int64_t numAct);
+REGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, CUDA,
+                     indice_maxpool_forward_cuda);
+
+torch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(
+    torch::Tensor features, torch::Tensor outFeatures, torch::Tensor outGrad,
+    torch::Tensor indicePairs, torch::Tensor indiceNum);
+
+torch::Tensor indice_maxpool_backward_cuda(torch::Tensor features,
+                                           torch::Tensor outFeatures,
+                                           torch::Tensor outGrad,
+                                           torch::Tensor indicePairs,
+                                           torch::Tensor indiceNum) {
+  return IndiceMaxpoolBackwardCUDAKernelLauncher(features, outFeatures,
+                                                 outGrad, indicePairs,
+                                                 indiceNum);
+};
+
+torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
+                                           torch::Tensor outFeatures,
+                                           torch::Tensor outGrad,
+                                           torch::Tensor indicePairs,
+                                           torch::Tensor indiceNum);
+REGISTER_DEVICE_IMPL(indice_maxpool_backward_impl, CUDA,
+                     indice_maxpool_backward_cuda)
+
+torch::Tensor IndiceConvForwardCUDAKernelLauncher(
+    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
+    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
+    int64_t _subM);
+
+torch::Tensor indice_conv_forward_cuda(torch::Tensor features,
+                                       torch::Tensor filters,
+                                       torch::Tensor indicePairs,
+                                       torch::Tensor indiceNum,
+                                       int64_t numActOut, int64_t _inverse,
+                                       int64_t _subM) {
+  return IndiceConvForwardCUDAKernelLauncher(
+      features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM);
+};
+
+torch::Tensor indice_conv_forward_impl(torch::Tensor features,
+                                       torch::Tensor filters,
+                                       torch::Tensor indicePairs,
+                                       torch::Tensor indiceNum,
+                                       int64_t numActOut, int64_t _inverse,
+                                       int64_t _subM);
+REGISTER_DEVICE_IMPL(indice_conv_forward_impl, CUDA, indice_conv_forward_cuda);
+
+std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
+    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
+    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
+    int64_t _subM);
+
+std::vector<torch::Tensor> indice_conv_backward_cuda(
+    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
+    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
+    int64_t _subM) {
+  return IndiceConvBackwardCUDAKernelLauncher(
+      features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);
+};
+
+std::vector<torch::Tensor> indice_conv_backward_impl(
+    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
+    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
+    int64_t _subM);
+REGISTER_DEVICE_IMPL(indice_conv_backward_impl, CUDA,
+                     indice_conv_backward_cuda);
+
+torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
+    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
+    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
+    int64_t _inverse, int64_t _subM);
+
+torch::Tensor fused_indice_conv_batchnorm_forward_cuda(
+    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
+    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
+    int64_t _inverse, int64_t _subM) {
+  return FusedIndiceConvBatchnormCUDAKernelLauncher(
+      features, filters, bias, indicePairs, indiceNum, numActOut, _inverse,
+      _subM);
+};
+
+torch::Tensor fused_indice_conv_batchnorm_forward_impl(
+    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
+    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
+    int64_t _inverse, int64_t _subM);
+REGISTER_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl, CUDA,
+                     fused_indice_conv_batchnorm_forward_cuda)
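Note: all of these sparse (indice) convolution entry points follow the gather-GEMM-scatter pattern: for each kernel offset, the active input features are gathered into a dense buffer, multiplied by that offset's filter, and scatter-added into the output rows. A dense, CPU-only C++ sketch of that data flow (illustrative; buffer reuse, half precision, and the `subM` center shortcut are omitted):

#include <vector>

// w[k] is the (Cin x Cout) filter for kernel offset k; pairs[k] lists the
// (input_row, output_row) pairs that are active for that offset.
void indice_conv_sketch(
    const std::vector<std::vector<float>>& features,
    const std::vector<std::vector<std::vector<float>>>& w,
    const std::vector<std::vector<std::pair<int, int>>>& pairs,
    std::vector<std::vector<float>>& out) {
  for (size_t k = 0; k < w.size(); ++k) {
    for (auto [in_idx, out_idx] : pairs[k]) {
      for (size_t co = 0; co < w[k][0].size(); ++co) {
        float acc = 0.f;
        for (size_t ci = 0; ci < w[k].size(); ++ci)
          acc += features[in_idx][ci] * w[k][ci][co];  // gather + GEMM row
        out[out_idx][co] += acc;                       // scatter-add
      }
    }
  }
}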
+void MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets,
+                                       Tensor polygons);
+
+void min_area_polygons_cuda(const Tensor pointsets, Tensor polygons) {
+  MinAreaPolygonsCUDAKernelLauncher(pointsets, polygons);
+}
+
+void min_area_polygons_impl(const Tensor pointsets, Tensor polygons);
+
+REGISTER_DEVICE_IMPL(min_area_polygons_impl, CUDA, min_area_polygons_cuda);
+void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,
+                                                  const Tensor indices,
+                                                  Tensor output);
+
+void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,
+                                                   const Tensor indices,
+                                                   Tensor grad_in);
+
+void active_rotated_filter_forward_cuda(const Tensor input,
+                                        const Tensor indices, Tensor output) {
+  ActiveRotatedFilterForwardCUDAKernelLauncher(input, indices, output);
+};
+
+void active_rotated_filter_backward_cuda(const Tensor grad_out,
+                                         const Tensor indices,
+                                         Tensor grad_in) {
+  ActiveRotatedFilterBackwardCUDAKernelLauncher(grad_out, indices, grad_in);
+};
+
+void active_rotated_filter_forward_impl(const Tensor input,
+                                        const Tensor indices, Tensor output);
+
+void active_rotated_filter_backward_impl(const Tensor grad_out,
+                                         const Tensor indices, Tensor grad_in);
+
+REGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CUDA,
+                     active_rotated_filter_forward_cuda);
+REGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CUDA,
+                     active_rotated_filter_backward_cuda);
+void ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
+                                 Tensor ious);
+
+void ConvexGIoUCUDAKernelLauncher(const Tensor pointsets,
+                                  const Tensor polygons, Tensor output);
+
+void convex_iou_cuda(const Tensor pointsets, const Tensor polygons,
+                     Tensor ious) {
+  ConvexIoUCUDAKernelLauncher(pointsets, polygons, ious);
+}
+
+void convex_giou_cuda(const Tensor pointsets, const Tensor polygons,
+                      Tensor output) {
+  ConvexGIoUCUDAKernelLauncher(pointsets, polygons, output);
+}
+
+void convex_iou_impl(const Tensor pointsets, const Tensor polygons,
+                     Tensor ious);
+
+void convex_giou_impl(const Tensor pointsets, const Tensor polygons,
+                      Tensor output);
+
+REGISTER_DEVICE_IMPL(convex_iou_impl, CUDA, convex_iou_cuda);
+REGISTER_DEVICE_IMPL(convex_giou_impl, CUDA, convex_giou_cuda);
+Tensor DiffIoURotatedSortVerticesCUDAKernelLauncher(Tensor vertices,
+                                                    Tensor mask,
+                                                    Tensor num_valid);
+
+Tensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices,
+                                                   Tensor mask,
+                                                   Tensor num_valid) {
+  return DiffIoURotatedSortVerticesCUDAKernelLauncher(vertices, mask,
+                                                      num_valid);
+}
+
+Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices,
+                                                   Tensor mask,
+                                                   Tensor num_valid);
+
+REGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, CUDA,
+                     diff_iou_rotated_sort_vertices_forward_cuda);
+void ChamferDistanceForwardCUDAKernelLauncher(
+    const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
+    const Tensor dist2, const Tensor idx1, const Tensor idx2);
+
+void ChamferDistanceBackwardCUDAKernelLauncher(
+    const Tensor xyz1, const Tensor xyz2, Tensor grad_xyz1, Tensor grad_xyz2,
+    Tensor grad_dist1, Tensor grad_dist2, Tensor idx1, Tensor idx2);
+
+void chamfer_distance_forward_cuda(const Tensor xyz1, const Tensor xyz2,
+                                   const Tensor dist1, const Tensor dist2,
+                                   const Tensor idx1, const Tensor idx2) {
+  ChamferDistanceForwardCUDAKernelLauncher(xyz1, xyz2, dist1, dist2, idx1,
+                                           idx2);
+};
+
+void chamfer_distance_backward_cuda(const Tensor xyz1, const Tensor xyz2,
+                                    Tensor gradxyz1, Tensor gradxyz2,
+                                    Tensor graddist1, Tensor graddist2,
+                                    Tensor idx1, Tensor idx2) {
+  ChamferDistanceBackwardCUDAKernelLauncher(xyz1, xyz2, gradxyz1, gradxyz2,
+                                            graddist1, graddist2, idx1, idx2);
+};
+
+void chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,
+                                   const Tensor dist1, const Tensor dist2,
+                                   const Tensor idx1, const Tensor idx2);
+
+void chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,
+                                    Tensor gradxyz1, Tensor gradxyz2,
+                                    Tensor graddist1, Tensor graddist2,
+                                    Tensor idx1, Tensor idx2);
+
+REGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, CUDA,
+                     chamfer_distance_forward_cuda);
+REGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, CUDA,
+                     chamfer_distance_backward_cuda);
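Note: assuming the standard definition of the (squared) chamfer distance between point sets X = {x_i} and Y = {y_j}, the outputs bound here correspond to its two directed terms:

\[
\mathrm{dist}_1(i) = \min_{j} \lVert x_i - y_j \rVert_2^2, \qquad
\mathrm{dist}_2(j) = \min_{i} \lVert x_i - y_j \rVert_2^2,
\]

with `idx1` and `idx2` storing the corresponding argmin indices, which is what makes the backward pass a simple gather along those indices.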
+void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
+                                        Tensor output, int pooled_height,
+                                        int pooled_width, float spatial_scale);
+
+void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
+                                         Tensor grad_input, int pooled_height,
+                                         int pooled_width,
+                                         float spatial_scale);
+
+void PrROIPoolCoorBackwardCUDAKernelLauncher(Tensor output, Tensor grad_output,
+                                             Tensor input, Tensor rois,
+                                             Tensor grad_rois,
+                                             int pooled_height,
+                                             int pooled_width,
+                                             float spatial_scale);
+
+void prroi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
+                             int pooled_height, int pooled_width,
+                             float spatial_scale) {
+  PrROIPoolForwardCUDAKernelLauncher(input, rois, output, pooled_height,
+                                     pooled_width, spatial_scale);
+}
+
+void prroi_pool_backward_cuda(Tensor grad_output, Tensor rois,
+                              Tensor grad_input, int pooled_height,
+                              int pooled_width, float spatial_scale) {
+  PrROIPoolBackwardCUDAKernelLauncher(grad_output, rois, grad_input,
+                                      pooled_height, pooled_width,
+                                      spatial_scale);
+}
+
+void prroi_pool_coor_backward_cuda(Tensor output, Tensor grad_output,
+                                   Tensor input, Tensor rois, Tensor grad_rois,
+                                   int pooled_height, int pooled_width,
+                                   float spatial_scale) {
+  PrROIPoolCoorBackwardCUDAKernelLauncher(output, grad_output, input, rois,
+                                          grad_rois, pooled_height,
+                                          pooled_width, spatial_scale);
+}
+
+void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
+                             int pooled_height, int pooled_width,
+                             float spatial_scale);
+
+void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
+                              Tensor grad_input, int pooled_height,
+                              int pooled_width, float spatial_scale);
+
+void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
+                                   Tensor input, Tensor rois, Tensor grad_rois,
+                                   int pooled_height, int pooled_width,
+                                   float spatial_scale);
+
+REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
+REGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);
+REGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,
+                     prroi_pool_coor_backward_cuda);
mmcv/ops/csrc/pytorch/cuda/diff_iou_rotated_cuda.cu
0 → 100644
// Copyright (c) OpenMMLab. All rights reserved
// Adapted from
// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa
#include "diff_iou_rotated_cuda_kernel.cuh"
#include "pytorch_cpp_helper.hpp"
#include "pytorch_cuda_helper.hpp"
at::Tensor DiffIoURotatedSortVerticesCUDAKernelLauncher(at::Tensor vertices,
                                                        at::Tensor mask,
                                                        at::Tensor num_valid) {
  at::cuda::CUDAGuard device_guard(vertices.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  CHECK_CONTIGUOUS(vertices);
  CHECK_CONTIGUOUS(mask);
  CHECK_CONTIGUOUS(num_valid);
  CHECK_CUDA(vertices);
  CHECK_CUDA(mask);
  CHECK_CUDA(num_valid);

  int b = vertices.size(0);
  int n = vertices.size(1);
  int m = vertices.size(2);
  at::Tensor idx =
      torch::zeros({b, n, MAX_NUM_VERT_IDX},
                   at::device(vertices.device()).dtype(at::ScalarType::Int));

  diff_iou_rotated_sort_vertices_forward_cuda_kernel<<<b, opt_n_thread(n), 0,
                                                       stream>>>(
      b, n, m, vertices.data_ptr<float>(), mask.data_ptr<bool>(),
      num_valid.data_ptr<int>(), idx.data_ptr<int>());
  AT_CUDA_CHECK(cudaGetLastError());

  return idx;
}
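Note: sorting the intersection vertices into a consistent counter-clockwise order (the `idx` tensor returned here) is what makes the downstream IoU computation both simple and differentiable: once ordered, the convex intersection area follows from the shoelace formula, stated here for context (standard polygon geometry; the actual use lives in the referenced Rotated_IoU code):

\[
A = \frac{1}{2} \left| \sum_{k=0}^{m-1} \bigl( x_k\,y_{k+1} - x_{k+1}\,y_k \bigr) \right|,
\qquad (x_m, y_m) = (x_0, y_0).
\]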
mmcv/ops/csrc/pytorch/cuda/fused_spconv_ops_cuda.cu
0 → 100644
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  at::cuda::CUDAGuard device_guard(features.device());
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor output =
      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {
    // the center index of subm conv don't need gather and scatter
    // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        features.scalar_type(), "FusedIndiceConvBatchnormKernel", [&] {
          auto outputBufferBlob = torch::from_blob(
              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
          auto inputBufferBlob = torch::from_blob(
              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
          if (device == torch::kCPU) {
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
          } else {
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherFtor;
            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
            TV_CHECK_CUDA_ERR();
            /* slower than SparseGatherFunctor, may due to int->long conversion
            auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
            auto indicePairBlob =
                torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},
                                 indicePairOptions);
            torch::index_select_out(inputBufferBlob, features, 0,
                                    indicePairBlob); */
          }
          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
          if (device == torch::kCPU) {
            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
                scatterFtor;
            scatterFtor(tv::CPU(), tv::torch2tv<scalar_t>(output),
                        tv::torch2tv<const scalar_t>(outputBuffer),
                        tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                        nHot, true);
          } else {
            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
                scatterFtor;
            scatterFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
                        tv::torch2tv<const scalar_t>(outputBuffer),
                        tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                        nHot, true);
            TV_CHECK_CUDA_ERR();
          }
        });
  }
  return output;
}
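Note: in this launcher the batch-norm parameters are presumably already folded into `filters` and `bias` (the launcher itself only ever sees a per-channel bias, hence "fused"), so the whole op reduces to a bias-initialized accumulation of per-offset GEMMs. Schematically, with \(G_k\) and \(S_k\) the gather and scatter-add selections for kernel offset \(k\):

\[
\mathrm{out} = \mathbf{1}\, b^{\top} + \sum_{k} S_k\!\bigl( G_k(F)\, W_k \bigr),
\]

which matches the code above: `output` starts as a copy of `bias`, and each loop iteration scatter-adds the product of the gathered `inputBuffer` with `filters[i]`.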