OpenDAS / apex

Commit 337056c1 authored Feb 05, 2019 by Michael Carilli

New downscale kernel is working but not perf tested

parent 45537d34
Showing 5 changed files with 69 additions and 101 deletions.
apex/amp/handle.py                   (+0, -8)
apex/amp/opt.py                      (+0, -9)
apex/amp/scaler.py                   (+4, -3)
csrc/scale_check_overflow.cpp        (+14, -5)
csrc/scale_check_overflow_kernel.cu  (+51, -76)
apex/amp/handle.py

@@ -40,16 +40,8 @@ class AmpHandle(object):
                 'use `optimizer.scale_loss(loss)`.')
 
         # TODO: this code block is duplicated here and `opt.py`. Unify.
-        loss_backward = loss.backward
-        def warning_wrapper():
-            warnings.warn("You called .backward() on the unscaled loss "
-                          "inside a scale_loss block. This is almost "
-                          "certainly an error.", stacklevel=2)
-            loss_backward()
-        loss.backward = warning_wrapper
         loss_scale = self._default_scaler.loss_scale()
         yield loss * loss_scale
-        loss.backward = loss_backward
 
         should_skip = self._default_scaler.unscale_and_update(optimizer.param_groups, loss_scale)
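The removed warning_wrapper existed to catch callers who ran .backward() on the unscaled loss inside a scale_loss block instead of on the scaled loss that the context manager yields. Below is a minimal, self-contained sketch of that usage pattern; scale_loss here is a stand-in written for illustration, not apex's actual handle, and the fixed loss scale of 128 is arbitrary.

# Toy sketch of the intended scale_loss usage pattern (not apex itself).
import contextlib
import torch

@contextlib.contextmanager
def scale_loss(loss, loss_scale=128.0):
    # Yield the scaled loss; grads produced inside the block are loss_scale
    # times larger and must be unscaled before the optimizer step.
    yield loss * loss_scale

model = torch.nn.Linear(4, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
loss = model(torch.randn(8, 4)).pow(2).mean()

with scale_loss(loss) as scaled_loss:
    scaled_loss.backward()          # backward on the scaled loss, not `loss`

for p in model.parameters():        # unscale the grads before stepping
    p.grad.div_(128.0)
opt.step()

In apex itself the unscale step is performed by LossScaler.unscale_and_update (shown in the hunks below) rather than the manual div_ used in this sketch.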
apex/amp/opt.py

@@ -21,14 +21,6 @@ class OptimWrapper(object):
             yield loss
             return
 
-        loss_backward = loss.backward
-        def warning_wrapper():
-            warnings.warn("You called .backward() on the unscaled loss "
-                          "inside a scale_loss block. This is almost "
-                          "certainly an error.", stacklevel=2)
-            loss_backward()
-        loss.backward = warning_wrapper
-
         # When there are multiple losses per-optimizer, we need
         # to save out current grad accumulation, since we won't be
         # able to unscale this particulare loss once the grads are

@@ -44,7 +36,6 @@ class OptimWrapper(object):
         loss_scale = self._cur_loss_scaler().loss_scale()
         yield loss * loss_scale
-        loss.backward = loss_backward
 
         self._skip_next[self._loss_idx] = self._cur_loss_scaler().unscale_and_update(
             self._optimizer.param_groups, loss_scale)
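OptimWrapper keeps one skip flag per loss (self._skip_next[self._loss_idx]) so that an overflow seen while unscaling any of the losses causes the eventual optimizer step to be skipped. The toy sketch below shows that bookkeeping in isolation; the class name and methods are invented for illustration and are not apex's OptimWrapper.

# Illustrative sketch of per-loss overflow bookkeeping (invented names).
import torch

class SkipOnOverflowOptimizer:
    def __init__(self, optimizer, num_losses):
        self.optimizer = optimizer
        self.skip_next = [False] * num_losses

    def record(self, loss_idx, had_overflow):
        # Called after unscaling the grads contributed by loss `loss_idx`.
        self.skip_next[loss_idx] = had_overflow

    def step(self):
        if any(self.skip_next):
            print("overflow detected; skipping optimizer.step() this iteration")
        else:
            self.optimizer.step()
        self.skip_next = [False] * len(self.skip_next)

model = torch.nn.Linear(2, 1)
wrapper = SkipOnOverflowOptimizer(torch.optim.SGD(model.parameters(), lr=0.1),
                                  num_losses=2)
wrapper.record(0, had_overflow=False)
wrapper.record(1, had_overflow=True)
wrapper.step()   # skipped, because loss 1 overflowed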
apex/amp/scaler.py

@@ -32,7 +32,7 @@ class LossScaler(object):
             import amp_C
             LossScaler.has_fused_kernel = True
             LossScaler.scale_check_overflow_cuda = amp_C.scale_check_overflow
-            self._overflow_buf = torch.cuda.ByteTensor(1024,)
+            self._overflow_buf = torch.cuda.IntTensor([0])
         except ImportError as err:
             if not LossScaler.warned_no_fused_kernel:
                 print("Warning: Amp fused downscale kernel is unavailable, possibly because apex "

@@ -53,7 +53,8 @@ class LossScaler(object):
                 if LossScaler.has_fused_kernel and p.grad.data.type() == "torch.cuda.FloatTensor":
-                    LossScaler.scale_check_overflow_cuda(p.grad.data, 1./scale, self._overflow_buf)
+                    LossScaler.scale_check_overflow_cuda(p.grad.data, 1./scale,
+                                                         self._overflow_buf, p.grad.data)
                 else:
                     if (p.grad.data.type() != "torch.cuda.FloatTensor"
                             and not LossScaler.warned_fp16_grad):

@@ -69,7 +70,7 @@ class LossScaler(object):
         # If the fused kernel is available, we only need one D2H memcopy and sync.
         if LossScaler.has_fused_kernel and not self._has_overflow:
-            self._has_overflow = self._overflow_buf.any()
+            self._has_overflow = self._overflow_buf.item()
         if self._has_overflow:
             should_skip = True
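Taken together, these hunks switch LossScaler to a persistent single-int overflow flag: each parameter's gradient is handed to the fused kernel with the same tensor as input and output (so it downscales in place), and the flag is read back once with .item(), i.e. one device-to-host copy and sync per iteration. The sketch below is a rough reimplementation of that control flow under assumed names (unscale_and_check, fused_fn), not apex's actual LossScaler; the fallback branch uses plain PyTorch ops so it also runs on CPU.

# Rough sketch of the fused vs. fallback control flow (assumed names).
import torch

def unscale_and_check(params, scale, fused_fn=None, overflow_buf=None):
    has_overflow = False
    for p in params:
        if p.grad is None:
            continue
        g = p.grad.data
        if fused_fn is not None and g.type() == "torch.cuda.FloatTensor":
            # Fused path: one kernel per grad, overflow accumulated on-device.
            fused_fn(g, 1. / scale, overflow_buf, g)
        else:
            # Fallback path: downscale in Python and check on the host.
            g.mul_(1. / scale)
            if not torch.isfinite(g).all():
                has_overflow = True
    if fused_fn is not None and not has_overflow:
        # Single device-to-host copy and sync for the whole parameter set.
        has_overflow = bool(overflow_buf.item())
    return has_overflow

# Fallback-path demo on CPU:
lin = torch.nn.Linear(3, 3)
lin(torch.randn(5, 3)).sum().backward()
print(unscale_and_check(lin.parameters(), scale=2.0**16))   # False: grads are finite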
csrc/scale_check_overflow.cpp

 #include <torch/extension.h>
 
-void scale_check_overflow_cuda(const at::Tensor& d_grads,
-                               float scale,
-                               const at::Tensor& d_buf);
+void scale_check_overflow_cuda(const at::Tensor& grads,
+                               float scale,
+                               const at::Tensor& d_buf,
+                               const at::Tensor& downscaled_grads);
 
-void scale_check_overflow(at::Tensor grads,
-                          float scale,
-                          at::Tensor overflow_buf)
+void scale_check_overflow(at::Tensor grads,
+                          float scale,
+                          at::Tensor overflow_buf,
+                          at::Tensor downscaled_grads)
+                          // const at::optional<at::Tensor> downscaled_grads)
 {
   AT_CHECK(grads.type().is_cuda(), "grads must be a CUDA tensor");
   AT_CHECK(grads.is_contiguous(), "grads must be contiguous");
   AT_CHECK(overflow_buf.type().is_cuda(), "overflow_buf must be a CUDA tensor");
   AT_CHECK(overflow_buf.is_contiguous(), "overflow_buf must be contiguous");
+  AT_CHECK(downscaled_grads.type().is_cuda(), "downscaled_grads must be a CUDA tensor");
+  AT_CHECK(downscaled_grads.is_contiguous(), "downscaled_grads must be contiguous");
   // Make sure we are downscaling the FP32 master grads
-  AT_CHECK(grads.type().scalarType() == at::ScalarType::Float,
-           "grads supplied to scale_check_overflow should be fp32 (master grads).")
+  AT_CHECK(downscaled_grads.type().scalarType() == at::ScalarType::Float,
+           "The output grads supplied to scale_check_overflow should be fp32 (master grads).")
 
-  scale_check_overflow_cuda(grads, scale, overflow_buf);
+  scale_check_overflow_cuda(grads, scale, overflow_buf, downscaled_grads);
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
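From Python, this entry point is the amp_C.scale_check_overflow that scaler.py above binds to LossScaler.scale_check_overflow_cuda, now taking four arguments. Below is a hedged sketch of a direct call; it assumes apex was built with its CUDA extensions so that the amp_C module is importable, and that a CUDA device is available.

# Sketch: calling the updated binding directly (requires apex's amp_C
# extension and a CUDA device).
import torch
import amp_C

scale = 2.0**16
grads = torch.randn(1 << 20, device="cuda")   # fp32 "master" grads
downscaled = torch.empty_like(grads)          # fp32 output buffer
overflow_buf = torch.cuda.IntTensor([0])      # persistent overflow flag

# New signature: (grads, scale, overflow_buf, downscaled_grads).
amp_C.scale_check_overflow(grads, 1. / scale, overflow_buf, downscaled)

if overflow_buf.item():
    print("inf/nan found in grads; skip this optimizer step")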
csrc/scale_check_overflow_kernel.cu

@@ -4,101 +4,76 @@
 #include <ATen/cuda/Exceptions.h>
 #include <assert.h>
 #include <cuda_runtime.h>
 
 #define BLOCK_SIZE 1024
 #define MAX_BLOCKS 1024
 
-// It makes sense to lock the type to "float" here because the downscaling
-// should only be applied to the FP32 master gradients. Also, if "in" were
-// a different type, it would require divergent code for the vectorized load logic.
+// It makes sense to lock the output type to fp32 because the downscaled
+// grads should be master grads (and in the case of Amp, the params and their
+// gradients should always be fp32.
 
 // TODO:
 // Update overflow check to use reduction from kernel_utils.cuh with
 // ReduceOp from THCTensorMathReduce.cuh.
 
-__global__ void scale_reduce_overflow(float* in,
-                                      size_t n,
-                                      float scale,
-                                      uint8_t* overflow_out)
+template<typename in_t>
+__global__ void scale_reduce_overflow(in_t* in,
+                                      float* out,
+                                      size_t n,
+                                      float scale,
+                                      volatile int* overflow_global)
 {
-  __shared__ uint8_t cta_overflow[BLOCK_SIZE];
+  __shared__ int overflow;
 
   int tid = blockIdx.x*blockDim.x + threadIdx.x;
   int stride = gridDim.x*blockDim.x;
 
-  uint8_t my_overflow = 0;
-  for (int i = tid*4; i < n; i += stride*4)
-  {
-    if (i < (n - 3))
-    {
-      float4 f4 = ((float4*)in)[i/4];
-      if (isfinite(f4.x)) {
-        f4.x *= scale;
-      } else {
-        my_overflow = 1;
-      }
-      if (isfinite(f4.y)) {
-        f4.y *= scale;
-      } else {
-        my_overflow = 1;
-      }
-      if (isfinite(f4.z)) {
-        f4.z *= scale;
-      } else {
-        my_overflow = 1;
-      }
-      if (isfinite(f4.w)) {
-        f4.w *= scale;
-      } else {
-        my_overflow = 1;
-      }
-      ((float4*)in)[i/4] = f4;
-    }
-    else
-    {
-      for (; i < n; ++i) {
-        if (isfinite(in[i])) {
-          in[i] *= scale;
-        } else {
-          my_overflow = 1;
-        }
-      }
-    }
-  }
-
-  int tIdx = threadIdx.x;
-  cta_overflow[tIdx] = my_overflow;
-  __syncthreads();
-
-  int participating = BLOCK_SIZE/2;
-  while (participating > 0)
-  {
-    if (tIdx < participating) {
-      cta_overflow[tIdx] = max(cta_overflow[tIdx],
-                               cta_overflow[tIdx + participating]);
-    }
-    participating /= 2;
-    __syncthreads();
-  }
-
-  if (tIdx == 0) {
-    overflow_out[blockIdx.x] = max(cta_overflow[0],
-                                   overflow_out[blockIdx.x]);
-  }
+  // Non-divergent exit condition for the __syncthreads
+  for (int i = tid; i - threadIdx.x < n; i += stride)
+  {
+    if (threadIdx.x == 0)
+      overflow = *overflow_global;
+    __syncthreads();
+    if (overflow == 1)
+      break;
+    if (tid < n)
+    {
+      float incoming_val = static_cast<float>(in[i]);
+      if (isfinite(incoming_val))
+        out[i] = incoming_val*scale;
+      else
+        *overflow_global = 1; // Blindly fire off a write. These will race but that's ok.
+    }
+  }
 }
 
-void scale_check_overflow_cuda(const at::Tensor& d_grads,
-                               float scale,
-                               const at::Tensor& d_buf)
+void scale_check_overflow_cuda(const at::Tensor& grads,
+                               float scale,
+                               const at::Tensor& overflow_buf,
+                               const at::Tensor& downscaled_grads)
 {
   using namespace at;
 
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  size_t n = d_grads.numel();
-  size_t buf_n = d_buf.numel();
-
-  int num_blks = min((int(n) + BLOCK_SIZE - 1)/BLOCK_SIZE, MAX_BLOCKS);
-  assert(buf_n >= num_blks);
-
-  scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>
-    (d_grads.data<float>(),
-     n,
-     scale,
-     d_buf.data<uint8_t>());
+  size_t n = grads.numel();
+  int num_blks = 160;
+
+  // Lock the output (downscaled) type to float.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(grads.type(), "scale_check_overflow_cuda",
+    [&]
+    {
+      // using accscalar_t = acc_type<scalar_t, true>;
+      scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>
+        (grads.data<scalar_t>(),
+         downscaled_grads.data<float>(),
+         n,
+         scale,
+         overflow_buf.data<int>());
+    });
 
   AT_CUDA_CHECK(cudaGetLastError());
 }
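One consequence of the rewrite is the launch configuration: the old host code sized the grid from n (capped at MAX_BLOCKS) and asserted the overflow buffer had room for one byte per block, while the new host code always launches 160 blocks and lets the grid-stride loop cover any n. The short sketch below only reproduces that arithmetic for a few illustrative sizes; 160 is the hard-coded choice from this commit, presumably to be tuned later ("not perf tested").

# Arithmetic sketch of the old vs. new grid sizing (illustrative values).
BLOCK_SIZE = 1024
MAX_BLOCKS = 1024

def old_num_blks(n):
    # min((int(n) + BLOCK_SIZE - 1)/BLOCK_SIZE, MAX_BLOCKS) from the removed code
    return min((n + BLOCK_SIZE - 1) // BLOCK_SIZE, MAX_BLOCKS)

def new_num_blks(n):
    return 160  # fixed grid; each thread strides by gridDim.x * blockDim.x

for n in (10_000, 1_000_000, 100_000_000):
    print(n, old_num_blks(n), new_num_blks(n))
# 10_000      -> 10 blocks old, 160 new
# 1_000_000   -> 977 blocks old, 160 new
# 100_000_000 -> capped at 1024 old, 160 new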