ModelZoo / SOLOv2-pytorch / Commits

Commit b04a0157, authored Sep 23, 2018 by Kai Chen

add fp16 support for forwarding

parent 7f9d2eb5
Showing 3 changed files with 61 additions and 98 deletions:

mmdet/ops/roi_align/src/roi_align_kernel.cu   +40  -52
mmdet/ops/roi_pool/gradcheck.py                +1   -1
mmdet/ops/roi_pool/src/roi_pool_kernel.cu     +20  -45
mmdet/ops/roi_align/src/roi_align_kernel.cu
#include <ATen/ATen.h>
#include <THC/THCAtomics.cuh>
#include <cuda.h>
#include <cuda_runtime.h>

using namespace at;  // temporal fix for pytorch<=0.4.1 (see #9848)

#include <math.h>
#include <stdio.h>
#include <vector>

#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)

#define THREADS_PER_BLOCK 1024

...
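CUDA_1D_KERNEL_LOOP is a grid-stride loop: each thread starts at its global index and advances by the total number of launched threads, so one launch covers any nthreads. The GET_BLOCKS helper used by the launchers below is not shown in this diff; a minimal sketch of the usual pairing, with GET_BLOCKS written out here as an assumption, looks like this:

inline int GET_BLOCKS(const int N) {
  // Assumed definition (elided from the diff): enough blocks of
  // THREADS_PER_BLOCK threads to cover N elements.
  return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
}

// Hypothetical kernel, for illustration only: thread `index` handles elements
// index, index + blockDim.x * gridDim.x, index + 2 * blockDim.x * gridDim.x, ...
template <typename scalar_t>
__global__ void scale_kernel(const int nthreads, const scalar_t alpha,
                             scalar_t *data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) { data[index] *= alpha; }
}

// Launch pattern used throughout this file:
// scale_kernel<float><<<GET_BLOCKS(n), THREADS_PER_BLOCK>>>(n, 2.f, d_ptr);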
@@ -28,10 +24,8 @@ __device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
    return 0;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low = (int)y;
  int x_low = (int)x;

...
@@ -69,12 +63,13 @@ __device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
}

template <typename scalar_t>
__global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
                                const scalar_t *bottom_rois,
                                const scalar_t spatial_scale, const int sample_num,
                                const int channels, const int height,
                                const int width, const int pooled_height,
                                const int pooled_width, scalar_t *top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the aligned output
    int pw = index % pooled_width;

...
@@ -101,7 +96,7 @@ ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
    int sample_num_h = (sample_num > 0)
                           ? sample_num
                           : ceil(roi_height / pooled_height);  // e.g., = 2
    int sample_num_w =
        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);

...
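The "// e.g., = 2" comment above can be made concrete: when sample_num is 0 the kernel derives the per-bin sample count from the RoI size. A self-contained worked example with hypothetical numbers (not taken from the diff):

#include <cmath>
#include <cstdio>

int main() {
  float roi_height = 13.4f, roi_width = 20.0f;  // hypothetical RoI size, in feature-map cells
  int pooled_height = 7, pooled_width = 7, sample_num = 0;
  int sample_num_h =
      (sample_num > 0) ? sample_num : (int)ceil(roi_height / pooled_height);
  int sample_num_w =
      (sample_num > 0) ? sample_num : (int)ceil(roi_width / pooled_width);
  printf("%d x %d samples per bin\n", sample_num_h, sample_num_w);  // prints "2 x 3"
  return 0;
}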
@@ -137,17 +132,17 @@ int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
                           const int pooled_height, const int pooled_width,
                           at::Tensor output) {
  const int output_size = num_rois * pooled_height * pooled_width * channels;
-  AT_DISPATCH_FLOATING_TYPES(
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.type(), "ROIAlignLaucherForward", ([&] {
        const scalar_t *bottom_data = features.data<scalar_t>();
        const scalar_t *rois_data = rois.data<scalar_t>();
        scalar_t *top_data = output.data<scalar_t>();

        ROIAlignForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
                sample_num, channels, height, width, pooled_height,
                pooled_width, top_data);
      }));
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {

...
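The substantive change in this hunk is the dispatch macro: AT_DISPATCH_FLOATING_TYPES_AND_HALF also instantiates the lambda with scalar_t bound to at::Half, so the templated forward kernel gets compiled and launched for fp16 inputs. A self-contained sketch of the dispatch idea (this mirrors the pattern only, not ATen's actual macro body):

#include <cstdio>

enum class DType { Float, Double, Half };
struct HalfStub { unsigned short bits; };  // stand-in for at::Half in this sketch

// Bind scalar_t to the runtime dtype and run the generic callable; the extra
// Half case is what the _AND_HALF variant adds.
template <typename F>
void dispatch_floating_types_and_half(DType t, F &&f) {
  switch (t) {
    case DType::Float:  f(float{});    break;
    case DType::Double: f(double{});   break;
    case DType::Half:   f(HalfStub{}); break;
  }
}

int main() {
  dispatch_floating_types_and_half(DType::Half, [](auto tag) {
    using scalar_t = decltype(tag);
    printf("would launch ROIAlignForward<scalar_t> with %zu-byte elements\n",
           sizeof(scalar_t));
  });
  return 0;
}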
@@ -159,11 +154,12 @@ int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
}

template <typename scalar_t>
__device__ void bilinear_interpolate_gradient(
    const int height, const int width, scalar_t y, scalar_t x, scalar_t &w1,
    scalar_t &w2, scalar_t &w3, scalar_t &w4, int &x_low, int &x_high,
    int &y_low, int &y_high) {
  // deal with cases that inverse elements are out of feature map boundary
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    w1 = w2 = w3 = w4 = 0.;

...
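For reference, the weights this function fills in (on lines elided between hunks) are the standard bilinear-interpolation ones; a compact restatement, not copied from the diff:

// w1..w4 weight the four neighbours (y_low, x_low), (y_low, x_high),
// (y_high, x_low), (y_high, x_high) by the opposite fractional areas around (y, x).
template <typename scalar_t>
void bilinear_weights(scalar_t y, scalar_t x, int y_low, int x_low,
                      scalar_t &w1, scalar_t &w2, scalar_t &w3, scalar_t &w4) {
  scalar_t ly = y - y_low, lx = x - x_low;  // fractional offsets inside the cell
  scalar_t hy = 1 - ly, hx = 1 - lx;
  w1 = hy * hx;
  w2 = hy * lx;
  w3 = ly * hx;
  w4 = ly * lx;
}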
@@ -171,10 +167,8 @@ bilinear_interpolate_gradient(const int height, const int width, scalar_t y,
    return;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  y_low = (int)y;
  x_low = (int)x;

...
@@ -204,12 +198,11 @@ bilinear_interpolate_gradient(const int height, const int width, scalar_t y,
}

template <typename scalar_t>
__global__ void ROIAlignBackward(const int nthreads, const scalar_t *top_diff,
                                 const scalar_t *bottom_rois,
                                 const scalar_t spatial_scale, const int sample_num,
                                 const int channels, const int height,
                                 const int width, const int pooled_height,
                                 const int pooled_width, scalar_t *bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the aligned output
    int pw = index % pooled_width;

...
@@ -239,7 +232,7 @@ ROIAlignBackward(const int nthreads, const scalar_t *top_diff,
    int sample_num_h = (sample_num > 0)
                           ? sample_num
                           : ceil(roi_height / pooled_height);  // e.g., = 2
    int sample_num_w =
        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);

...
@@ -279,13 +272,6 @@ ROIAlignBackward(const int nthreads, const scalar_t *top_diff,
  }
}

-template <>
-__global__ void ROIAlignBackward<double>(
-    const int nthreads, const double *top_diff, const double *bottom_rois,
-    const double spatial_scale, const int sample_num, const int channels,
-    const int height, const int width, const int pooled_height,
-    const int pooled_width, double *bottom_diff) {}
-
int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                            const float spatial_scale, const int sample_num,
                            const int channels, const int height,

...
@@ -294,6 +280,7 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                            at::Tensor bottom_grad) {
  const int output_size = num_rois * pooled_height * pooled_width * channels;

+  // TODO: use AT_DISPATCH_FLOATING_TYPES_AND_HALF when atomicAdd is resolved
  AT_DISPATCH_FLOATING_TYPES(
      top_grad.type(), "ROIAlignLaucherBackward", ([&] {
        const scalar_t *top_diff = top_grad.data<scalar_t>();

...
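The forward pass needs no atomics, which is why fp16 could be enabled there first; the backward kernels scatter-add gradients into bottom_diff, and the TODO above keeps half disabled until an atomicAdd for it is available. A minimal, hypothetical illustration (not part of this commit) of the accumulation pattern that forces the float/double-only backward dispatch:

#include <cstdio>
#include <cuda_runtime.h>

// Several top cells can map to the same bottom cell, so the gradient must be
// accumulated atomically; atomicAdd has float/double overloads (double via
// THCAtomics.cuh on older archs), but no equally portable half overload here.
__global__ void scatter_add(const float *top_diff, const int *argmax, int n,
                            float *bottom_diff) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(bottom_diff + argmax[i], top_diff[i]);
}

int main() {
  const int n = 4;
  float h_top[n] = {1.f, 2.f, 3.f, 4.f};
  int h_idx[n] = {0, 0, 1, 1};  // two top cells collide on each bottom cell
  float *d_top, *d_bot;
  int *d_idx;
  cudaMalloc(&d_top, n * sizeof(float));
  cudaMalloc(&d_idx, n * sizeof(int));
  cudaMalloc(&d_bot, 2 * sizeof(float));
  cudaMemcpy(d_top, h_top, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_idx, h_idx, n * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemset(d_bot, 0, 2 * sizeof(float));
  scatter_add<<<1, 32>>>(d_top, d_idx, n, d_bot);
  float h_bot[2];
  cudaMemcpy(h_bot, d_bot, 2 * sizeof(float), cudaMemcpyDeviceToHost);
  printf("%f %f\n", h_bot[0], h_bot[1]);  // expect 3.0 7.0
  return 0;
}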
@@ -304,10 +291,11 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
          exit(-1);
        }
        ROIAlignBackward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, top_diff, rois_data, spatial_scale, sample_num,
                channels, height, width, pooled_height, pooled_width,
                bottom_diff);
      }));
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {

...
mmdet/ops/roi_pool/gradcheck.py
@@ -4,7 +4,7 @@ from torch.autograd import gradcheck
import os.path as osp
import sys
sys.path.append(osp.abspath(osp.join(__file__, '../../')))
-from roi_pooling import RoIPool
+from roi_pool import RoIPool

feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda()
rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55],

...
mmdet/ops/roi_pool/src/roi_pool_kernel.cu
#include <ATen/ATen.h>
#include <THC/THCAtomics.cuh>
#include <cuda.h>
#include <cuda_runtime.h>

using namespace at;  // temporal fix for pytorch<=0.4.1 (see #9848)

#include <math.h>
#include <stdio.h>
#include <vector>

#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)

#define THREADS_PER_BLOCK 1024

...
@@ -44,8 +40,7 @@ __global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data,
    // force malformed rois to be 1x1
    scalar_t roi_w = roi_x2 - roi_x1;
    scalar_t roi_h = roi_y2 - roi_y1;
    if (roi_w <= 0 || roi_h <= 0) continue;

    scalar_t bin_size_w = roi_w / static_cast<scalar_t>(pooled_w);
    scalar_t bin_size_h = roi_h / static_cast<scalar_t>(pooled_h);

...
@@ -68,7 +63,8 @@ __global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data,
    bottom_data += (roi_batch_ind * channels + c) * height * width;
    // Define an empty pooling region to be zero
-    scalar_t max_val = is_empty ? 0 : bottom_data[bin_y1 * width + bin_x1] - 1;
+    scalar_t max_val = is_empty ? static_cast<scalar_t>(0)
+                                : bottom_data[bin_y1 * width + bin_x1] - 1;
    for (int h = bin_y1; h < bin_y2; ++h) {
      for (int w = bin_x1; w < bin_x2; ++w) {

...
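The static_cast above makes both branches of the conditional the same scalar_t, which matters once scalar_t can be a half type: with a bare 0 one branch is int and the other is the half type, and depending on how implicit that type's conversions are this either fails to compile or quietly routes the value through another type. A small self-contained sketch of the pattern, modelling the half type with a stub that only converts explicitly (an assumption for illustration, not at::Half itself):

struct Half {                       // stand-in for a half-precision element type
  float v;
  explicit Half(float f) : v(f) {}  // deliberately explicit: no implicit int -> Half
  Half operator-(int rhs) const { return Half(v - static_cast<float>(rhs)); }
};

template <typename scalar_t>
scalar_t initial_max(bool is_empty, const scalar_t *bottom_data, int offset) {
  // return is_empty ? 0 : bottom_data[offset] - 1;   // ill-formed for Half: no common type
  return is_empty ? static_cast<scalar_t>(0)
                  : bottom_data[offset] - 1;          // both branches are scalar_t
}

// Both initial_max<float> and initial_max<Half> now instantiate cleanly.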
@@ -80,8 +76,7 @@ __global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data,
      }
    }
    top_data[index] = max_val;
    if (argmax_data != NULL) argmax_data[index] = max_idx;
  }
}

...
@@ -92,17 +87,18 @@ int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois,
                          at::Tensor output, at::Tensor argmax) {
  const int output_size = num_rois * channels * pooled_h * pooled_w;
-  AT_DISPATCH_FLOATING_TYPES(
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.type(), "ROIPoolLaucherForward", ([&] {
        const scalar_t *bottom_data = features.data<scalar_t>();
        const scalar_t *rois_data = rois.data<scalar_t>();
        scalar_t *top_data = output.data<scalar_t>();
        int *argmax_data = argmax.data<int>();

        ROIPoolForward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
                channels, height, width, pooled_h, pooled_w, top_data,
                argmax_data);
      }));
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {

...
@@ -135,28 +131,6 @@ __global__ void ROIPoolBackward(const int nthreads, const scalar_t *top_diff,
  }
}

-template <>
-__global__ void ROIPoolBackward<double>(
-    const int nthreads, const double *top_diff, const double *rois,
-    const int *argmax_data, const double spatial_scale, const int channels,
-    const int height, const int width, const int pooled_h, const int pooled_w,
-    double *bottom_diff) {
-  // CUDA_1D_KERNEL_LOOP(index, nthreads) {
-  //   int pw = index % pooled_w;
-  //   int ph = (index / pooled_w) % pooled_h;
-  //   int c = (index / pooled_w / pooled_h) % channels;
-  //   int n = index / pooled_w / pooled_h / channels;
-  //   int roi_batch_ind = rois[n * 5];
-  //   int bottom_index = argmax_data[(n * channels + c) * pooled_h * pooled_w +
-  //                                  ph * pooled_w + pw];
-  //   *(bottom_diff + (roi_batch_ind * channels + c) * height * width +
-  //     bottom_index) += top_diff[index];
-  // }
-}
-
int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                           const at::Tensor argmax, const float spatial_scale,
                           const int batch_size, const int channels,

...
@@ -165,6 +139,7 @@ int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                           const int pooled_w, at::Tensor bottom_grad) {
  const int output_size = num_rois * pooled_h * pooled_w * channels;

+  // TODO: use AT_DISPATCH_FLOATING_TYPES_AND_HALF when atomicAdd is resolved
  AT_DISPATCH_FLOATING_TYPES(
      top_grad.type(), "ROIPoolLaucherBackward", ([&] {
        const scalar_t *top_diff = top_grad.data<scalar_t>();

...
@@ -177,11 +152,11 @@ int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
          exit(-1);
        }
        ROIPoolBackward<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, top_diff, rois_data, argmax_data,
                scalar_t(spatial_scale), channels, height, width, pooled_h,
                pooled_w, bottom_diff);
      }));
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {

...