OpenDAS / apex · Commit 8818ba9e

Merge branch 'new_downscale_kernel'

Authored Feb 05, 2019 by Michael Carilli
Parents: 9288ba5c, a5bc76db

Showing 3 changed files with 72 additions and 84 deletions.
apex/amp/scaler.py                    +4   -3
csrc/scale_check_overflow.cpp         +15  -5
csrc/scale_check_overflow_kernel.cu   +53  -76
apex/amp/scaler.py
@@ -32,7 +32,7 @@ class LossScaler(object):
             import amp_C
             LossScaler.has_fused_kernel = True
             LossScaler.scale_check_overflow_cuda = amp_C.scale_check_overflow
-            self._overflow_buf = torch.cuda.ByteTensor(1024,)
+            self._overflow_buf = torch.cuda.IntTensor([0])
         except ImportError as err:
             if not LossScaler.warned_no_fused_kernel:
                 print("Warning: Amp fused downscale kernel is unavailable, possibly because apex "
...
@@ -53,7 +53,8 @@ class LossScaler(object):
         if LossScaler.has_fused_kernel and p.grad.data.type() == "torch.cuda.FloatTensor":
             LossScaler.scale_check_overflow_cuda(p.grad.data,
                                                  1./scale,
-                                                 self._overflow_buf)
+                                                 self._overflow_buf,
+                                                 p.grad.data)
         else:
             if (p.grad.data.type() != "torch.cuda.FloatTensor"
                     and not LossScaler.warned_fp16_grad):
...
@@ -69,7 +70,7 @@ class LossScaler(object):
         # If the fused kernel is available, we only need one D2H memcopy and sync.
         if LossScaler.has_fused_kernel and not self._has_overflow:
-            self._has_overflow = self._overflow_buf.any()
+            self._has_overflow = self._overflow_buf.item()
         if self._has_overflow:
             should_skip = True
...
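
Taken together, the scaler.py hunks change the fused path to use a single-int overflow flag and an explicit output argument (here the gradient tensor itself, so the downscale still happens in place). A minimal sketch of the new call pattern, assuming the amp_C extension built from the csrc files below is importable and a CUDA device is available:

    import torch
    import amp_C  # apex's fused extension; present only if apex was built with its CUDA extensions

    scale = 2.**16
    grad = torch.randn(1 << 20, device="cuda", dtype=torch.float32)  # fp32 master grad
    overflow_buf = torch.cuda.IntTensor([0])  # single int flag, as in the new scaler.py

    # New argument order, matching scaler.py: (grads, 1./scale, overflow_buf, downscaled_grads).
    # Passing grad as both input and output downscales it in place.
    amp_C.scale_check_overflow(grad, 1. / scale, overflow_buf, grad)

    # One D2H copy + sync tells us whether any gradient value was inf/nan.
    has_overflow = bool(overflow_buf.item())

Compared to the old torch.cuda.ByteTensor(1024,) buffer, which held one byte per block and had to be reduced with .any(), the single int flag can be read back with a plain .item().
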
csrc/scale_check_overflow.cpp
 #include <torch/extension.h>

-void scale_check_overflow_cuda(const at::Tensor& d_grads,
+void scale_check_overflow_cuda(const at::Tensor& grads,
                                float scale,
-                               const at::Tensor& d_buf);
+                               const at::Tensor& d_buf,
+                               const at::Tensor& downscaled_grads);

 void scale_check_overflow(at::Tensor grads,
                           float scale,
-                          at::Tensor overflow_buf)
+                          at::Tensor overflow_buf,
+                          at::Tensor downscaled_grads)
+                          // const at::optional<at::Tensor> downscaled_grads)
 {
   AT_CHECK(grads.type().is_cuda(), "grads must be a CUDA tensor");
   AT_CHECK(grads.is_contiguous(), "grads must be contiguous");
   AT_CHECK(overflow_buf.type().is_cuda(), "overflow_buf must be a CUDA tensor");
   AT_CHECK(overflow_buf.is_contiguous(), "overflow_buf must be contiguous");
+  AT_CHECK(downscaled_grads.type().is_cuda(), "downscaled_grads must be a CUDA tensor");
+  AT_CHECK(downscaled_grads.is_contiguous(), "downscaled_grads must be contiguous");
   // Make sure we are downscaling the FP32 master grads
-  AT_CHECK(grads.type().scalarType() == at::ScalarType::Float,
-           "grads supplied to scale_check_overflow should be fp32 (master grads).")
+  AT_CHECK(downscaled_grads.type().scalarType() == at::ScalarType::Float,
+           "The output grads supplied to scale_check_overflow should be fp32 (master grads).")
+  AT_CHECK(grads.numel() == downscaled_grads.numel(),
+           "Input and output grads must be the same size.");

-  scale_check_overflow_cuda(grads, scale, overflow_buf);
+  scale_check_overflow_cuda(grads, scale, overflow_buf, downscaled_grads);
 }

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
...
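
The wrapper above only validates its arguments and forwards to the CUDA implementation; it is what PYBIND11_MODULE exposes to Python as amp_C.scale_check_overflow. As a rough sketch (apex itself installs the extension through its own setup.py, so this is only an illustrative assumption), the two csrc files could be JIT-compiled into an equivalent module with PyTorch's C++ extension loader:

    # Hypothetical JIT build of the same sources; not apex's actual build path.
    from torch.utils.cpp_extension import load

    amp_C = load(
        name="amp_C",
        sources=["csrc/scale_check_overflow.cpp",
                 "csrc/scale_check_overflow_kernel.cu"],
        verbose=True,
    )

Either way, the AT_CHECKs fire at call time, so a non-CUDA or non-contiguous tensor, a non-fp32 output, or mismatched sizes should surface as a Python-level error rather than a kernel fault.
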
csrc/scale_check_overflow_kernel.cu
...
@@ -4,101 +4,78 @@
 #include <ATen/cuda/Exceptions.h>
 #include <assert.h>
 #include <cuda_runtime.h>

 #define BLOCK_SIZE 1024
-#define MAX_BLOCKS 1024
+#define NBLOCKS 160

-// It makes sense to lock the type to "float" here because the downscaling
-// should only be applied to the FP32 master gradients. Also, if "in" were
-// a different type, it would require divergent code for the vectorized load logic.
+// It makes sense to lock the output type to fp32 because the downscaled
+// grads should be master grads (and in the case of Amp, the params and their
+// gradients should always be fp32).

-// TODO:
-// Update overflow check to use reduction from kernel_utils.cuh with
-// ReduceOp from THCTensorMathReduce.cuh.
-__global__ void scale_reduce_overflow(float* in,
-                                      size_t n,
-                                      float scale,
-                                      uint8_t* overflow_out)
-{
-  __shared__ uint8_t cta_overflow[BLOCK_SIZE];
-
-  int tid = blockIdx.x*blockDim.x + threadIdx.x;
-  int stride = gridDim.x*blockDim.x;
-
-  uint8_t my_overflow = 0;
-  for(int i = tid*4; i < n; i += stride*4)
-  {
-    if(i < (n - 3))
-    {
-      float4 f4 = ((float4*)in)[i/4];
-      if(isfinite(f4.x)) { f4.x *= scale; } else { my_overflow = 1; }
-      if(isfinite(f4.y)) { f4.y *= scale; } else { my_overflow = 1; }
-      if(isfinite(f4.z)) { f4.z *= scale; } else { my_overflow = 1; }
-      if(isfinite(f4.w)) { f4.w *= scale; } else { my_overflow = 1; }
-      ((float4*)in)[i/4] = f4;
-    }
-    else
-    {
-      for(; i < n; ++i)
-      {
-        if(isfinite(in[i])) { in[i] *= scale; } else { my_overflow = 1; }
-      }
-    }
-  }
-
-  int tIdx = threadIdx.x;
-  cta_overflow[tIdx] = my_overflow;
-  __syncthreads();
-
-  int participating = BLOCK_SIZE/2;
-  while(participating > 0)
-  {
-    if(tIdx < participating)
-    {
-      cta_overflow[tIdx] = max(cta_overflow[tIdx],
-                               cta_overflow[tIdx + participating]);
-    }
-    participating /= 2;
-    __syncthreads();
-  }
-
-  if(tIdx == 0)
-  {
-    overflow_out[blockIdx.x] = max(cta_overflow[0],
-                                   overflow_out[blockIdx.x]);
-  }
-}
+// This can be optimized with ILP but it's fine for now.
+template<typename in_t>
+__global__ void scale_reduce_overflow(in_t* in,
+                                      float* out,
+                                      int n,
+                                      float scale,
+                                      volatile int* overflow_global)
+{
+  __shared__ int overflow;
+
+  int tid = blockIdx.x*blockDim.x + threadIdx.x;
+  int stride = gridDim.x*blockDim.x;
+
+  // Non-divergent exit condition for the __syncthreads
+  for(int i = tid; i - threadIdx.x < n; i += stride)
+  {
+    if(threadIdx.x == 0)
+      overflow = *overflow_global;
+
+    __syncthreads();
+
+    if(overflow == 1)
+      break;
+
+    if(i < n)
+    {
+      float incoming_val = static_cast<float>(in[i]);
+      if(isfinite(incoming_val))
+        out[i] = incoming_val*scale;
+      else
+        *overflow_global = 1;
+        // Blindly fire off a write. These will race but that's ok.
+        // This is NOT guaranteed to be seen immediately by thread 0 on the next iteration.
+        // I wonder if there's a way we can rig the short-circuiting with only one syncthreads.
+        // It's possible we can just lean on the cache (no smem or syncs) and still be fast.
+    }
+  }
+}

-void scale_check_overflow_cuda(const at::Tensor& d_grads,
+void scale_check_overflow_cuda(const at::Tensor& grads,
                                float scale,
-                               const at::Tensor& d_buf)
+                               const at::Tensor& overflow_buf,
+                               const at::Tensor& downscaled_grads)
 {
   using namespace at;

   cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  size_t n = d_grads.numel();
-  size_t buf_n = d_buf.numel();
-
-  int num_blks = min((int(n) + BLOCK_SIZE - 1)/BLOCK_SIZE, MAX_BLOCKS);
-  assert(buf_n >= num_blks);
-
-  scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>(d_grads.data<float>(),
-                                                             n,
-                                                             scale,
-                                                             d_buf.data<uint8_t>());
+  int n = grads.numel();
+
+  // Lock the output (downscaled) type to float.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF
+    (grads.type(),
+     "scale_check_overflow_cuda",
+     [&]
+     {
+       // using accscalar_t = acc_type<scalar_t, true>;
+       scale_reduce_overflow<<<NBLOCKS, BLOCK_SIZE, 0, stream>>>
+         (grads.data<scalar_t>(),
+          downscaled_grads.data<float>(),
+          n,
+          scale,
+          overflow_buf.data<int>());
+     });

   AT_CUDA_CHECK(cudaGetLastError());
 }
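
The new kernel's contract is: finite input values are cast to fp32, multiplied by scale, and written to out; any inf/nan sets the global int flag, and once the flag is visible a block stops writing. An illustrative way to sanity-check that contract from Python (this test is not part of the commit; it assumes amp_C is importable and a CUDA device is present):

    import torch
    import amp_C  # fused extension built from the csrc files above

    inv_scale = 1. / 2.**16
    grads = torch.randn(1 << 20, device="cuda", dtype=torch.float32)
    out = torch.empty_like(grads)
    buf = torch.cuda.IntTensor([0])

    # Clean input: the flag stays 0 and the output matches an eager downscale.
    amp_C.scale_check_overflow(grads, inv_scale, buf, out)
    assert buf.item() == 0
    assert torch.allclose(out, grads * inv_scale)

    # Poisoned input: the flag flips to 1, which scaler.py reads via self._overflow_buf.item().
    grads[12345] = float("inf")
    amp_C.scale_check_overflow(grads, inv_scale, buf, out)
    assert buf.item() == 1
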