"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "766721b1dfd7a92130146a549c4fcca15cc069b2"
Commit 337056c1 authored by Michael Carilli

New downscale kernel is working but not perf tested

parent 45537d34
@@ -40,16 +40,8 @@ class AmpHandle(object):
                 'use `optimizer.scale_loss(loss)`.')
 
         # TODO: this code block is duplicated here and `opt.py`. Unify.
-        loss_backward = loss.backward
-        def warning_wrapper():
-            warnings.warn("You called .backward() on the unscaled loss "
-                          "inside a scale_loss block. This is almost "
-                          "certainly an error.", stacklevel=2)
-            loss_backward()
-        loss.backward = warning_wrapper
         loss_scale = self._default_scaler.loss_scale()
         yield loss * loss_scale
-        loss.backward = loss_backward
 
         should_skip = self._default_scaler.unscale_and_update(
             optimizer.param_groups, loss_scale)
......
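For context, the block removed above patched `loss.backward` with a warning wrapper inside the `scale_loss` context manager; the supported pattern is simply to call `.backward()` on the scaled loss that the handle yields. A minimal usage sketch follows (hedged: `amp.init()` and the exact `scale_loss` signature are assumed from the surrounding apex.amp API and are not shown in this diff).

# Hedged sketch of the intended calling pattern; amp.init()/scale_loss names are assumptions.
import torch
from apex import amp

model = torch.nn.Linear(64, 8).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
amp_handle = amp.init()                      # assumed entry point that returns an AmpHandle

x = torch.randn(32, 64, device="cuda")
loss = model(x).float().pow(2).mean()

optimizer.zero_grad()
with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()                   # backward on the *scaled* loss yielded by the handle
optimizer.step()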
@@ -21,14 +21,6 @@ class OptimWrapper(object):
             yield loss
             return
 
-        loss_backward = loss.backward
-        def warning_wrapper():
-            warnings.warn("You called .backward() on the unscaled loss "
-                          "inside a scale_loss block. This is almost "
-                          "certainly an error.", stacklevel=2)
-            loss_backward()
-        loss.backward = warning_wrapper
-
         # When there are multiple losses per-optimizer, we need
         # to save out current grad accumulation, since we won't be
         # able to unscale this particulare loss once the grads are
@@ -44,7 +36,6 @@ class OptimWrapper(object):
         loss_scale = self._cur_loss_scaler().loss_scale()
         yield loss * loss_scale
-        loss.backward = loss_backward
 
         self._skip_next[self._loss_idx] = self._cur_loss_scaler().unscale_and_update(
             self._optimizer.param_groups, loss_scale)
......
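The `OptimWrapper` path above backs the `optimizer.scale_loss(loss)` usage named in the earlier error message, with one scaler per loss index. A hedged sketch of that calling pattern (how the optimizer gets wrapped into an OptimWrapper, and the `wrap_optimizer`/`num_loss` names, are assumptions, not read from this diff):

# Hedged sketch of the wrapped-optimizer, multiple-loss path; wrap_optimizer/num_loss are assumptions.
import torch
from apex import amp

model = torch.nn.Linear(64, 8).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
amp_handle = amp.init()
# Assumption: some amp call returns an OptimWrapper around `optimizer`.
optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=2)

losses = [model(torch.randn(32, 64, device="cuda")).float().pow(2).mean(),
          model(torch.randn(32, 64, device="cuda")).float().abs().mean()]

optimizer.zero_grad()
for loss in losses:                          # one loss scaler per loss index
    with optimizer.scale_loss(loss) as scaled_loss:
        scaled_loss.backward()
optimizer.step()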
@@ -32,7 +32,7 @@ class LossScaler(object):
                 import amp_C
                 LossScaler.has_fused_kernel = True
                 LossScaler.scale_check_overflow_cuda = amp_C.scale_check_overflow
-                self._overflow_buf = torch.cuda.ByteTensor(1024,)
+                self._overflow_buf = torch.cuda.IntTensor([0])
             except ImportError as err:
                 if not LossScaler.warned_no_fused_kernel:
                     print("Warning: Amp fused downscale kernel is unavailable, possibly because apex "
@@ -53,7 +53,8 @@ class LossScaler(object):
            if LossScaler.has_fused_kernel and p.grad.data.type() == "torch.cuda.FloatTensor":
                LossScaler.scale_check_overflow_cuda(p.grad.data,
                                                     1./scale,
-                                                    self._overflow_buf)
+                                                    self._overflow_buf,
+                                                    p.grad.data)
            else:
                if (p.grad.data.type() != "torch.cuda.FloatTensor"
                        and not LossScaler.warned_fp16_grad):
@@ -69,7 +70,7 @@ class LossScaler(object):
        # If the fused kernel is available, we only need one D2H memcopy and sync.
        if LossScaler.has_fused_kernel and not self._has_overflow:
-           self._has_overflow = self._overflow_buf.any()
+           self._has_overflow = self._overflow_buf.item()
        if self._has_overflow:
            should_skip = True
......
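The scaler changes above replace the per-block byte buffer with a single-element int flag that every kernel launch can set, and the flag is read back with one `.item()` call (one D2H copy and sync) after all grads have been processed. A paraphrased control-flow sketch is below; it is not the verbatim LossScaler code, and the fallback branch is my own illustration.

# Paraphrased control-flow sketch; not the verbatim apex LossScaler implementation.
import torch

def unscale_and_check(master_grads, loss_scale, fused_kernel=None, overflow_buf=None):
    """Downscale fp32 master grads by 1/loss_scale and report inf/NaN overflow."""
    software_overflow = False
    for grad in master_grads:
        if fused_kernel is not None and grad.type() == "torch.cuda.FloatTensor":
            # Fused path: one kernel per grad writes the downscaled values
            # (output tensor == input tensor here) and sets the int flag on overflow.
            fused_kernel(grad, 1. / loss_scale, overflow_buf, grad)
        else:
            # Fallback path: detect inf/NaN on the host, then downscale in place.
            if not bool(torch.isfinite(grad).all()):
                software_overflow = True
            grad.mul_(1. / loss_scale)
    if fused_kernel is not None:
        # Single D2H copy + sync to read the sticky overflow flag.
        return software_overflow or bool(overflow_buf.item())
    return software_overflow

In the diff, `overflow_buf` is the `torch.cuda.IntTensor([0])` allocated in `__init__`, and `fused_kernel` corresponds to `amp_C.scale_check_overflow` when the extension imports successfully.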
 #include <torch/extension.h>
 
-void scale_check_overflow_cuda(const at::Tensor& d_grads, float scale, const at::Tensor& d_buf);
+void scale_check_overflow_cuda(const at::Tensor& grads,
+                               float scale,
+                               const at::Tensor& d_buf,
+                               const at::Tensor& downscaled_grads);
 
-void scale_check_overflow(at::Tensor grads, float scale, at::Tensor overflow_buf)
+void scale_check_overflow(at::Tensor grads,
+                          float scale,
+                          at::Tensor overflow_buf,
+                          at::Tensor downscaled_grads)
+                          // const at::optional<at::Tensor> downscaled_grads)
 {
   AT_CHECK(grads.type().is_cuda(), "grads must be a CUDA tensor");
   AT_CHECK(grads.is_contiguous(), "grads must be contiguous");
   AT_CHECK(overflow_buf.type().is_cuda(), "overflow_buf must be a CUDA tensor");
   AT_CHECK(overflow_buf.is_contiguous(), "overflow_buf must be contiguous");
+  AT_CHECK(downscaled_grads.type().is_cuda(), "downscaled_grads must be a CUDA tensor");
+  AT_CHECK(downscaled_grads.is_contiguous(), "downscaled_grads must be contiguous");
   // Make sure we are downscaling the FP32 master grads
-  AT_CHECK(grads.type().scalarType() == at::ScalarType::Float,
-           "grads supplied to scale_check_overflow should be fp32 (master grads).")
-  scale_check_overflow_cuda(grads, scale, overflow_buf);
+  AT_CHECK(downscaled_grads.type().scalarType() == at::ScalarType::Float,
+           "The output grads supplied to scale_check_overflow should be fp32 (master grads).")
+  scale_check_overflow_cuda(grads, scale, overflow_buf, downscaled_grads);
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
......
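Since `amp_C.scale_check_overflow` is what `LossScaler` calls in the Python diff above, the new four-argument binding can also be exercised directly. A hedged example follows, assuming the extension builds and imports as `amp_C` and keeps this Python-visible name; the tensor sizes are arbitrary.

# Hedged example of driving the binding directly; assumes the extension is importable as amp_C.
import torch
import amp_C

grads = torch.randn(1 << 20, device="cuda", dtype=torch.float16)               # incoming grads (fp16 allowed by the dispatch)
master_grads = torch.empty(grads.numel(), device="cuda", dtype=torch.float32)  # fp32 output buffer
overflow_buf = torch.zeros(1, dtype=torch.int32, device="cuda")                # sticky overflow flag

loss_scale = 1024.0
amp_C.scale_check_overflow(grads, 1.0 / loss_scale, overflow_buf, master_grads)

if overflow_buf.item():
    print("inf/NaN seen in grads: skip optimizer.step() and lower the loss scale")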
@@ -4,101 +4,76 @@
 #include <ATen/cuda/Exceptions.h>
 #include <assert.h>
+#include <cuda_runtime.h>
 
 #define BLOCK_SIZE 1024
 #define MAX_BLOCKS 1024
 
-// It makes sense to lock the type to "float" here because the downscaling
-// should only be applied to the FP32 master gradients. Also, if "in" were
-// a different type, it would require divergent code for the vectorized load logic.
-// TODO:
-// Update overflow check to use reduction from kernel_utils.cuh with
-// ReduceOp from THCTensorMathReduce.cuh.
-__global__ void scale_reduce_overflow
-  (float *in,
-   size_t n,
-   float scale,
-   uint8_t *overflow_out)
-{
-  __shared__ uint8_t cta_overflow[BLOCK_SIZE];
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = gridDim.x * blockDim.x;
-  uint8_t my_overflow = 0;
-  for (int i = tid * 4; i < n; i+= stride * 4) {
-    if (i < (n - 3)) {
-      float4 f4 = ((float4*)in)[i / 4];
-      if (isfinite(f4.x)) {
-        f4.x *= scale;
-      } else {
-        my_overflow = 1;
-      }
-      if (isfinite(f4.y)) {
-        f4.y *= scale;
-      } else {
-        my_overflow = 1;
-      }
-      if (isfinite(f4.z)) {
-        f4.z *= scale;
-      } else {
-        my_overflow = 1;
-      }
-      if (isfinite(f4.w)) {
-        f4.w *= scale;
-      } else {
-        my_overflow = 1;
-      }
-      ((float4*)in)[i / 4] = f4;
-    } else {
-      for (; i < n; ++i) {
-        if (isfinite(in[i])) {
-          in[i] *= scale;
-        } else {
-          my_overflow = 1;
-        }
-      }
-    }
-  }
-
-  int tIdx = threadIdx.x;
-  cta_overflow[tIdx] = my_overflow;
-  __syncthreads();
-  int participating = BLOCK_SIZE / 2;
-  while (participating > 0) {
-    if (tIdx < participating) {
-      cta_overflow[tIdx] = max(cta_overflow[tIdx],
-                               cta_overflow[tIdx + participating]);
-    }
-    participating /= 2;
-    __syncthreads();
-  }
-
-  if (tIdx == 0) {
-    overflow_out[blockIdx.x] = max(cta_overflow[0],
-                                   overflow_out[blockIdx.x]);
-  }
-}
+// It makes sense to lock the output type to fp32 because the downscaled
+// grads should be master grads (and in the case of Amp, the params and their
+// gradients should always be fp32.
+template<typename in_t>
+__global__ void scale_reduce_overflow(in_t* in,
+                                      float* out,
+                                      size_t n,
+                                      float scale,
+                                      volatile int* overflow_global)
+{
+  __shared__ int overflow;
+  int tid = blockIdx.x*blockDim.x + threadIdx.x;
+  int stride = gridDim.x*blockDim.x;
+  // Non-divergent exit condition for the __syncthreads
+  for(int i = tid; i - threadIdx.x < n; i += stride)
+  {
+    if(threadIdx.x == 0)
+      overflow = *overflow_global;
+    __syncthreads();
+    if(overflow == 1)
+      break;
+
+    if(tid < n)
+    {
+      float incoming_val = static_cast<float>(in[i]);
+      if(isfinite(incoming_val))
+        out[i] = incoming_val*scale;
+      else
+        *overflow_global = 1; // Blindly fire off a write. These will race but that's ok.
+    }
+  }
+}
 
 void scale_check_overflow_cuda
-  (const at::Tensor& d_grads,
+  (const at::Tensor& grads,
    float scale,
-   const at::Tensor& d_buf)
+   const at::Tensor& overflow_buf,
+   const at::Tensor& downscaled_grads)
 {
   using namespace at;
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  size_t n = d_grads.numel();
-  size_t buf_n = d_buf.numel();
-
-  int num_blks = min((int(n) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-                     MAX_BLOCKS);
-  assert(buf_n >= num_blks);
-
-  scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>
-    (d_grads.data<float>(),
-     n,
-     scale,
-     d_buf.data<uint8_t>());
+  size_t n = grads.numel();
+
+  int num_blks = 160;
+
+  // Lock the output (downscaled) type to float.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(grads.type(),
+    "scale_check_overflow_cuda",
+    [&]
+    {
+      // using accscalar_t = acc_type<scalar_t, true>;
+      scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>
+        (grads.data<scalar_t>(),
+         downscaled_grads.data<float>(),
+         n,
+         scale,
+         overflow_buf.data<int>());
+    });
 
   AT_CUDA_CHECK(cudaGetLastError());
 }
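Compared with the removed float4/shared-memory reduction kernel, the new version templates on the input type, writes into a separate fp32 output, and replaces the per-block reduction with a grid-stride loop that checks a single global int flag and exits early once any block has observed an overflow (the racy writes to that flag are benign). As a host-side correctness reference, the intended element-wise semantics can be written out as below; this is my paraphrase for checking results, not part of the commit, and when the flag trips the downscaled outputs are discarded anyway because the step is skipped.

# Host-side reference of what the fused kernel is meant to compute (paraphrase, not the commit's code).
import torch

def scale_check_overflow_reference(grads, scale):
    """Return (downscaled fp32 grads, overflow flag)."""
    grads_fp32 = grads.float()
    overflow = not bool(torch.isfinite(grads_fp32).all())
    # On overflow the caller skips the optimizer step, so the contents of `out` are irrelevant.
    out = grads_fp32 * scale
    return out, overflow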