OpenDAS / apex

Commit 5698eeeb authored Apr 14, 2022 by Thor Johnsen

    Bit faster

parent 140282d5
Showing 2 changed files with 60 additions and 37 deletions

  apex/contrib/bottleneck/bottleneck.py         +16  -6
  apex/contrib/csrc/bottleneck/bottleneck.cpp   +44  -31
apex/contrib/bottleneck/bottleneck.py
@@ -448,14 +448,11 @@ class SpatialBottleneckFunction(torch.autograd.Function):
         t_list.append(ctx.saved_tensors[10])
 
         grads = fast_bottleneck.backward_init(ctx.explicit_nhwc, ctx.stride_1x1, t_list)
+        wgrad3_stream = torch.cuda.Stream()
+        wgrad3_stream.wait_stream(torch.cuda.current_stream())
         grad_out2 = fast_bottleneck.backward_grad_out2(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads)
         wgrad2_stream = torch.cuda.Stream()
         wgrad2_stream.wait_stream(torch.cuda.current_stream())
-        with torch.cuda.stream(wgrad2_stream):
-            if ctx.spatial_group_size > 1:
-                wgrad2 = fast_bottleneck.backward_wgrad2_pad(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, out1_pad, grad_out2)
-            else:
-                wgrad2 = fast_bottleneck.backward_wgrad2(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2)
 
         # do halo exchange of grad_out2 here
         # compute halo cells for grad_out1
         if ctx.spatial_group_size > 1:
@@ -576,8 +573,21 @@ class SpatialBottleneckFunction(torch.autograd.Function):
             if ctx.spatial_group_rank > 0:
                 torch.cuda.current_stream().wait_stream(ctx.stream1)
 
-        fast_bottleneck.backward_rest(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2, grad_out1, wgrad2)
+        wgrad1_stream = torch.cuda.Stream()
+        wgrad1_stream.wait_stream(torch.cuda.current_stream())
+        fast_bottleneck.backward_rest(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2, grad_out1)
+        with torch.cuda.stream(wgrad3_stream):
+            fast_bottleneck.backward_wgrad3(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads)
+        with torch.cuda.stream(wgrad2_stream):
+            if ctx.spatial_group_size > 1:
+                fast_bottleneck.backward_wgrad2_pad(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, out1_pad, grad_out2)
+            else:
+                fast_bottleneck.backward_wgrad2(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2)
+        with torch.cuda.stream(wgrad1_stream):
+            fast_bottleneck.backward_wgrad1(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out1)
+        torch.cuda.current_stream().wait_stream(wgrad3_stream)
         torch.cuda.current_stream().wait_stream(wgrad2_stream)
+        torch.cuda.current_stream().wait_stream(wgrad1_stream)
 
         return (None, None, None, None, None, None, None, None, None, None, None, None, *grads)
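Taken together, the two hunks above restructure the backward pass so that the three weight-gradient convolutions run on dedicated side streams (wgrad3_stream, wgrad2_stream, wgrad1_stream) and overlap with the data-gradient work on the default stream, instead of being interleaved with it (wgrad3 inside backward_grad_out2, wgrad2 on its stream immediately afterwards, wgrad1 inside backward_rest). A minimal sketch of the stream-overlap pattern itself, with generic callables standing in for the fast_bottleneck kernels (overlapped_backward below is an illustration, not a helper from the repository):

    import torch

    def overlapped_backward(dgrad_fn, wgrad_fns):
        # One side stream per independent weight-gradient kernel.
        streams = [torch.cuda.Stream() for _ in wgrad_fns]
        for s in streams:
            # A side stream must not start before the work already queued on
            # the default stream (which produced its inputs) has been issued.
            s.wait_stream(torch.cuda.current_stream())

        for s, fn in zip(streams, wgrad_fns):
            with torch.cuda.stream(s):
                fn()  # e.g. a wgrad kernel writing into a preallocated buffer

        out = dgrad_fn()  # critical-path dgrad work stays on the default stream

        for s in streams:
            # Rejoin before anything downstream reads the weight gradients.
            torch.cuda.current_stream().wait_stream(s)
        return out

As in general PyTorch stream usage, tensors produced on one stream and consumed on another must stay referenced until the consuming stream is synchronized; in the code above (and in the commit) the inputs remain in scope until after the final wait_stream calls.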
apex/contrib/csrc/bottleneck/bottleneck.cpp
@@ -3554,19 +3554,12 @@ std::vector<at::Tensor> bottleneck_backward_init(bool explicit_nhwc, int stride_
   return outputs;
 }
 
-at::Tensor bottleneck_backward_grad_out2(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs) {
-  bool requires_grad = inputs[0].requires_grad();
-
-  std::cout << std::fixed;
-  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
-
+void bottleneck_backward_wgrad3(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs) {
   // dconv3+drelu2+dscale2
   at::Half* conv_in = inputs[13].data_ptr<at::Half>();
   at::Half* dy3 = inputs[10].data_ptr<at::Half>();
 
   DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item<float>());
 
   // wgrad
   auto wgrad3 = outputs[3];
   at::Half* dw3 = wgrad3.data_ptr<at::Half>();
@@ -3583,6 +3576,21 @@ at::Tensor bottleneck_backward_grad_out2(bool explicit_nhwc, int stride_1X1, std
             CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
 
   DEBUG_MSG("[DEBUG] new wgrad3 : " << wgrad3.to(at::kFloat).sum().item<float>());
+}
+
+at::Tensor bottleneck_backward_grad_out2(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs) {
+  bool requires_grad = inputs[0].requires_grad();
+
+  std::cout << std::fixed;
+  auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+  // dconv3+drelu2+dscale2
+  at::Half* conv_in = inputs[13].data_ptr<at::Half>();
+  at::Half* dy3 = inputs[10].data_ptr<at::Half>();
+
+  DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item<float>());
 
   // dgrad
   auto grad_out2 = at::empty(backward_state.outdim2, inputs[0].type(), output_format);
   at::Half* dy2 = grad_out2.data_ptr<at::Half>();
@@ -3769,7 +3777,7 @@ at::Tensor bottleneck_backward_grad_out1_halo(bool explicit_nhwc, int stride_1X1
   return grad_out1_halo;
 }
 
-at::Tensor bottleneck_backward_wgrad2_pad(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor input, at::Tensor grad_out2) {
+void bottleneck_backward_wgrad2_pad(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor input, at::Tensor grad_out2) {
   std::cout << std::fixed;
   auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
@@ -3798,11 +3806,9 @@ at::Tensor bottleneck_backward_wgrad2_pad(bool explicit_nhwc, int stride_1X1, st
             dy2,
             CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
 
   DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item<float>());
-
-  return wgrad2;
 }
 
-at::Tensor bottleneck_backward_wgrad2(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor grad_out2) {
+void bottleneck_backward_wgrad2(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor grad_out2) {
   bool requires_grad = inputs[0].requires_grad();
@@ -3832,8 +3838,6 @@ at::Tensor bottleneck_backward_wgrad2(bool explicit_nhwc, int stride_1X1, std::v
             dy2,
             CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
 
   DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item<float>());
-
-  return wgrad2;
 }
 
 // compute halo cells for input volume of dimension [N,1,W,C] with padding=(0,1) to produce output volume of dimension [N,1,W,C]
@@ -3876,7 +3880,30 @@ at::Tensor bottleneck_backward_wgrad2_halo(bool explicit_nhwc, int stride_1X1, s
   return wgrad2_halo;
 }
 
-void bottleneck_backward_rest(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor grad_out2, at::Tensor grad_out1, at::Tensor wgrad2) {
+void bottleneck_backward_wgrad1(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor grad_out1) {
+  at::Half* x = inputs[0].data_ptr<at::Half>();
+  at::Half* dy1 = grad_out1.data_ptr<at::Half>();
+
+  // dconv1+add
+  // wgrad
+  auto wgrad1 = outputs[1];
+  at::Half* dw1 = wgrad1.data_ptr<at::Half>();
+  run_dconv(backward_state.dimA,
+            backward_state.padA,
+            backward_state.convstride1X1,
+            backward_state.dilationA,
+            backward_state.filterdimA1,
+            backward_state.outdimA1,
+            CUDNN_DATA_HALF,
+            x,
+            dw1,
+            dy1,
+            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
+}
+
+void bottleneck_backward_rest(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor grad_out2, at::Tensor grad_out1) {
   bool requires_grad = inputs[0].requires_grad();
@@ -3974,22 +4001,6 @@ void bottleneck_backward_rest(bool explicit_nhwc, int stride_1X1, std::vector<at
     dx_conv4 = inputs[11].data_ptr<at::Half>();
   }
 
-  // dconv1+add
-  // wgrad
-  auto wgrad1 = outputs[1];
-  at::Half* dw1 = wgrad1.data_ptr<at::Half>();
-  run_dconv(backward_state.dimA,
-            backward_state.padA,
-            backward_state.convstride1X1,
-            backward_state.dilationA,
-            backward_state.filterdimA1,
-            backward_state.outdimA1,
-            CUDNN_DATA_HALF,
-            x,
-            dw1,
-            dy1,
-            CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
-
   // dgrad
   w = inputs[1].data_ptr<at::Half>();
   auto grad_x = outputs[0];
@@ -4056,5 +4067,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("backward_wgrad2_pad", &bottleneck_backward_wgrad2_pad, "Bottleneck block backward");
   m.def("backward_wgrad2", &bottleneck_backward_wgrad2, "Bottleneck block backward");
   m.def("backward_wgrad2_halo", &bottleneck_backward_wgrad2_halo, "Bottleneck block backward");
+  m.def("backward_wgrad3", &bottleneck_backward_wgrad3, "Bottleneck block backward");
+  m.def("backward_wgrad1", &bottleneck_backward_wgrad1, "Bottleneck block backward");
   m.def("backward_rest", &bottleneck_backward_rest, "Bottleneck block backward");
 }
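The two new bindings, backward_wgrad3 and backward_wgrad1, expose the weight-gradient halves of conv3 and conv1 that previously ran inside backward_grad_out2 and backward_rest; like the re-typed wgrad2 variants they now return void and write their results into the preallocated outputs entries (outputs[3] and outputs[1]). For orientation, the computation backward_wgrad1 performs corresponds to the weight gradient of a 1x1 convolution, which in stock PyTorch looks roughly like the sketch below (shapes, dtype, and stride are illustrative only, not taken from the extension):

    import torch
    import torch.nn.grad

    # Illustrative NCHW shapes for the first 1x1 conv of a bottleneck block.
    x   = torch.randn(8, 256, 56, 56, device="cuda", dtype=torch.half)  # block input
    dy1 = torch.randn(8, 64, 56, 56, device="cuda", dtype=torch.half)   # grad_out1

    # Rough equivalent of the cuDNN CONVOLUTION_BACKWARD_FILTER call in
    # bottleneck_backward_wgrad1: dL/dW for the 1x1 convolution.
    dw1 = torch.nn.grad.conv2d_weight(x, (64, 256, 1, 1), dy1, stride=1, padding=0)

Because these calls only have side effects on the outputs buffers, the Python side can launch each of them on its own CUDA stream (as in the bottleneck.py hunks above) without caring about return values.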