New downscale kernel is working but not perf tested

337056c1 · Michael Carilli · 45537d34 · 337056c1 · 337056c1 · 337056c1
Commit 337056c1 authored Feb 05, 2019 by Michael Carilli
5 changed files
--- a/apex/amp/handle.py
+++ b/apex/amp/handle.py
@@ -40,16 +40,8 @@ class AmpHandle(object):
                'use `optimizer.scale_loss(loss)`.')

        # TODO: this code block is duplicated here and `opt.py`. Unify.
-        loss_backward = loss.backward
-        def warning_wrapper():
-            warnings.warn("You called .backward() on the unscaled loss "
-                          "inside a scale_loss block. This is almost "
-                          "certainly an error.", stacklevel=2)
-            loss_backward()
-        loss.backward = warning_wrapper
        loss_scale = self._default_scaler.loss_scale()
        yield loss * loss_scale
-        loss.backward = loss_backward

        should_skip = self._default_scaler.unscale_and_update(
            optimizer.param_groups, loss_scale)

--- a/apex/amp/opt.py
+++ b/apex/amp/opt.py
@@ -21,14 +21,6 @@ class OptimWrapper(object):
            yield loss
            return

-        loss_backward = loss.backward
-        def warning_wrapper():
-            warnings.warn("You called .backward() on the unscaled loss "
-                          "inside a scale_loss block. This is almost "
-                          "certainly an error.", stacklevel=2)
-            loss_backward()
-        loss.backward = warning_wrapper
-
        # When there are multiple losses per-optimizer, we need
        # to save out current grad accumulation, since we won't be
        # able to unscale this particulare loss once the grads are
@@ -44,7 +36,6 @@ class OptimWrapper(object):
        
        loss_scale = self._cur_loss_scaler().loss_scale()
        yield loss * loss_scale
-        loss.backward = loss_backward

        self._skip_next[self._loss_idx] = self._cur_loss_scaler().unscale_and_update(
            self._optimizer.param_groups, loss_scale)

--- a/apex/amp/scaler.py
+++ b/apex/amp/scaler.py
@@ -32,7 +32,7 @@ class LossScaler(object):
            import amp_C
            LossScaler.has_fused_kernel = True
            LossScaler.scale_check_overflow_cuda = amp_C.scale_check_overflow
-            self._overflow_buf = torch.cuda.ByteTensor(1024,)
+            self._overflow_buf = torch.cuda.IntTensor([0])
        except ImportError as err:
            if not LossScaler.warned_no_fused_kernel:
                print("Warning:  Amp fused downscale kernel is unavailable, possibly because apex "
@@ -53,7 +53,8 @@ class LossScaler(object):
                if LossScaler.has_fused_kernel and p.grad.data.type() == "torch.cuda.FloatTensor":
                    LossScaler.scale_check_overflow_cuda(p.grad.data,
                                                         1./scale,
-                                                         self._overflow_buf)
+                                                         self._overflow_buf,
+                                                         p.grad.data)
                else:
                    if (p.grad.data.type() != "torch.cuda.FloatTensor"
                            and not LossScaler.warned_fp16_grad):
@@ -69,7 +70,7 @@ class LossScaler(object):

        # If the fused kernel is available, we only need one D2H memcopy and sync.
        if LossScaler.has_fused_kernel and not self._has_overflow:
-            self._has_overflow = self._overflow_buf.any()
+            self._has_overflow = self._overflow_buf.item()

        if self._has_overflow:
            should_skip = True

--- a/csrc/scale_check_overflow.cpp
+++ b/csrc/scale_check_overflow.cpp
 #include <torch/extension.h>

-void scale_check_overflow_cuda(const at::Tensor& d_grads, float scale, const at::Tensor& d_buf);
+void scale_check_overflow_cuda(const at::Tensor& grads,
+                               float scale,
+                               const at::Tensor& d_buf,
+                               const at::Tensor& downscaled_grads);

-void scale_check_overflow(at::Tensor grads, float scale, at::Tensor overflow_buf)
+void scale_check_overflow(at::Tensor grads,
+                          float scale, 
+                          at::Tensor overflow_buf,
+                          at::Tensor downscaled_grads)
+                          // const at::optional<at::Tensor> downscaled_grads)
 { 
  AT_CHECK(grads.type().is_cuda(), "grads must be a CUDA tensor");
  AT_CHECK(grads.is_contiguous(), "grads must be contiguous");
  AT_CHECK(overflow_buf.type().is_cuda(), "overflow_buf must be a CUDA tensor");
  AT_CHECK(overflow_buf.is_contiguous(), "overflow_buf must be contiguous");
+  AT_CHECK(downscaled_grads.type().is_cuda(), "downscaled_grads must be a CUDA tensor");
+  AT_CHECK(downscaled_grads.is_contiguous(), "downscaled_grads must be contiguous");
  // Make sure we are downscaling the FP32 master grads
-  AT_CHECK(grads.type().scalarType() == at::ScalarType::Float, 
-    "grads supplied to scale_check_overflow should be fp32 (master grads).")
+  AT_CHECK(downscaled_grads.type().scalarType() == at::ScalarType::Float, 
+    "The output grads supplied to scale_check_overflow should be fp32 (master grads).")

-  scale_check_overflow_cuda(grads, scale, overflow_buf);
+  scale_check_overflow_cuda(grads, scale, overflow_buf, downscaled_grads);
 }

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

--- a/csrc/scale_check_overflow_kernel.cu
+++ b/csrc/scale_check_overflow_kernel.cu
@@ -4,101 +4,76 @@
 #include <ATen/cuda/Exceptions.h>

 #include <assert.h>
+#include <cuda_runtime.h>

 #define BLOCK_SIZE 1024
 #define MAX_BLOCKS 1024

-// It makes sense to lock the type to "float" here because the downscaling
-// should only be applied to the FP32 master gradients.  Also, if "in" were 
-// a different type, it would require divergent code for the vectorized load logic.
+// It makes sense to lock the output type to fp32 because the downscaled
+// grads should be master grads (and in the case of Amp, the params and their
+// gradients should always be fp32.

-// TODO:  
-// Update overflow check to use reduction from kernel_utils.cuh with 
-// ReduceOp from THCTensorMathReduce.cuh.
-__global__ void scale_reduce_overflow
-  (float *in, 
-   size_t n, 
-   float scale,
-   uint8_t *overflow_out) 
+template<typename in_t>
+__global__ void scale_reduce_overflow(in_t* in,
+                                      float* out, 
+                                      size_t n, 
+                                      float scale,
+                                      volatile int* overflow_global) 
 {
-    __shared__ uint8_t cta_overflow[BLOCK_SIZE];
+  __shared__ int overflow;

-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int stride = gridDim.x * blockDim.x;
+  int tid = blockIdx.x*blockDim.x + threadIdx.x;
+  int stride = gridDim.x*blockDim.x;

-    uint8_t my_overflow = 0;
-    for (int i = tid * 4; i < n; i+= stride * 4) {
-        if (i < (n - 3)) {
-            float4 f4 = ((float4*)in)[i / 4];
-            if (isfinite(f4.x)) {
-                f4.x *= scale;
-            } else {
-                my_overflow = 1;
-            }
-            if (isfinite(f4.y)) {
-                f4.y *= scale;
-            } else {
-                my_overflow = 1;
-            }
-            if (isfinite(f4.z)) {
-                f4.z *= scale;
-            } else {
-                my_overflow = 1;
-            }
-            if (isfinite(f4.w)) {
-                f4.w *= scale;
-            } else {
-                my_overflow = 1;
-            }
-            ((float4*)in)[i / 4] = f4;
-        } else {
-            for (; i < n; ++i) {
-                if (isfinite(in[i])) {
-                    in[i] *= scale;
-                } else {
-                    my_overflow = 1;
-                }
-            }
-        }
-    }
+  // Non-divergent exit condition for the __syncthreads
+  for(int i = tid; i - threadIdx.x < n; i += stride)
+  {
+    if(threadIdx.x == 0)
+      overflow = *overflow_global;

-    int tIdx = threadIdx.x;
-    cta_overflow[tIdx] = my_overflow;
    __syncthreads();

-    int participating = BLOCK_SIZE / 2;
-    while (participating > 0) {
-        if (tIdx < participating) {
-            cta_overflow[tIdx] = max(cta_overflow[tIdx],
-                                     cta_overflow[tIdx + participating]);
-        }
-        participating /= 2;
-        __syncthreads();
-    }
-    if (tIdx == 0) {
-        overflow_out[blockIdx.x] = max(cta_overflow[0],
-                                       overflow_out[blockIdx.x]);
-    }
+    if(overflow == 1)
+      break;
+    
+    if(tid < n)
+    {
+      float incoming_val = static_cast<float>(in[i]);
+      if(isfinite(incoming_val))
+        out[i] = incoming_val*scale;
+      else
+        *overflow_global = 1; // Blindly fire off a write.  These will race but that's ok.
+    } 
+  }  
 }

+
 void scale_check_overflow_cuda
-  (const at::Tensor& d_grads, 
+  (const at::Tensor& grads, 
   float scale,
-   const at::Tensor& d_buf) 
+   const at::Tensor& overflow_buf,
+   const at::Tensor& downscaled_grads) 
 {
  using namespace at;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  
-  size_t n = d_grads.numel();
-  size_t buf_n = d_buf.numel();
+  size_t n = grads.numel();
+
+  int num_blks = 160;
+ 
+  // Lock the output (downscaled) type to float.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(grads.type(),
+     "scale_check_overflow_cuda",
+     [&]
+     {
+       // using accscalar_t = acc_type<scalar_t, true>;
+       scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>
+         (grads.data<scalar_t>(), 
+          downscaled_grads.data<float>(),
+          n, 
+          scale, 
+          overflow_buf.data<int>());
+     });

-  int num_blks = min((int(n) + BLOCK_SIZE - 1) / BLOCK_SIZE,
-                     MAX_BLOCKS);
-  assert(buf_n >= num_blks);
-  scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>
-    (d_grads.data<float>(), 
-     n, 
-     scale, 
-     d_buf.data<uint8_t>());
  AT_CUDA_CHECK(cudaGetLastError());
 }