Unverified commit 17ee854e authored by Deyu Fu, committed by GitHub

enable wider load/store for multi_tensor_apply kernels (#763)

* modify MTA axpby for wider load/store

* Make scale/axpby/l2/adam/lamb multi_tensor kernels use wider loads
parent 31aceeaa
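The gist of the change: when a tensor's pointers are ILP*sizeof(T)-aligned and the chunk length is a multiple of ILP, each thread moves ILP elements per array with a single wide load/store instead of ILP scalar accesses, and falls back to the original element-wise path otherwise. Below is a minimal standalone CUDA sketch of the pattern; is_aligned/load_store mirror the helpers added by this commit, while the wide_copy kernel is purely illustrative and not part of the diff.

// Illustrative sketch -- not part of this commit's diff. "wide_copy" is a hypothetical kernel.
#include <cstdint>
#include <type_traits>

#define ILP 4

template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
  // A pointer can serve an ILP-wide vector access only if it is ILP*sizeof(T)-aligned.
  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}

template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
  // Reinterpret both sides as one ILP*sizeof(T)-byte blob so the compiler can emit
  // a single wide load and a single wide store (e.g. 128-bit for float with ILP = 4).
  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}

// Hypothetical grid-stride copy showing the aligned fast path and the scalar fallback.
template<typename T>
__global__ void wide_copy(T* dst, T* src, int n){
  if(n % ILP == 0 && is_aligned(dst) && is_aligned(src)){
    for(int i = blockIdx.x*blockDim.x + threadIdx.x; i*ILP < n; i += gridDim.x*blockDim.x){
      T buf[ILP];
      load_store(buf, src, 0, i);  // vectorized load of ILP elements
      load_store(dst, buf, i, 0);  // vectorized store of ILP elements
    }
  } else {
    for(int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += gridDim.x*blockDim.x)
      dst[i] = src[i];             // original element-wise path
  }
}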
@@ -14,6 +14,17 @@
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
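// Usage sketch (illustrative): with T = float and ILP = 4, a call such as
//   load_store(incoming_p, p, 0, i_start);
// copies the four consecutive floats starting at p + i_start*ILP into the local
// array incoming_p as one 16-byte transaction, provided p passes is_aligned().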
#include "type_shim.h"
typedef enum{
@@ -99,11 +110,51 @@ struct AdamFunctor
T incoming_v[ILP];
T incoming_g[ILP];
// to make things simple, we put aligned case in a different code path
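// (The aligned path lets each thread move ILP elements per array with one wide
// access -- 16 bytes for float, 8 bytes for half at ILP=4 -- instead of ILP
// separate element-wise accesses with per-element bounds checks.)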
if(n % ILP == 0 &&
chunk_size % ILP == 0 &&
is_aligned(p) &&
is_aligned(m) &&
is_aligned(v) &&
is_aligned(g) &&
is_aligned(p_copy))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
GRAD_T tmp_g[ILP];
load_store(incoming_p, p, 0, i_start);
load_store(incoming_m, m, 0, i_start);
load_store(incoming_v, v, 0, i_start);
load_store(tmp_g, g, 0, i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
incoming_g[ii] = static_cast<T>(tmp_g[ii]);
T scaled_grad = incoming_g[ii]/grad_scale;
incoming_m[ii] = b1*incoming_m[ii] + (1-b1)*scaled_grad;
incoming_v[ii] = b2*incoming_v[ii] + (1-b2)*scaled_grad*scaled_grad;
float denom;
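// ADAM_MODE_0 applies eps inside the square root; ADAM_MODE_1 is the standard
// Adam form with eps added after the square root.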
if (mode == ADAM_MODE_0)
denom = sqrtf(incoming_v[ii] + eps);
else // Mode 1
denom = sqrtf(incoming_v[ii]) + eps;
float update = (incoming_m[ii]/denom) + (decay*incoming_p[ii]);
incoming_p[ii] = incoming_p[ii] - (step_size*update);
if (DEPTH == 5) tmp_g[ii] = static_cast<GRAD_T>(incoming_p[ii]);
}
load_store(p, incoming_p, i_start, 0);
load_store(m, incoming_m, i_start, 0);
load_store(v, incoming_v, i_start, 0);
if (DEPTH == 5) load_store(p_copy, tmp_g, i_start, 0);
}
}
else
{
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP) {
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
incoming_p[ii] = 0;
incoming_m[ii] = 0;
@@ -124,7 +175,7 @@ struct AdamFunctor
// the write loop, since writes just fire off once their LDGs arrive.
// Put another way, the STGs are dependent on the LDGs, but not on each other.
// There is still compute ILP benefit from unrolling the loop though.
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int j = i_start + threadIdx.x + ii*blockDim.x;
@@ -144,6 +195,7 @@ struct AdamFunctor
}
}
}
}
};
void fused_adam_cuda(
@@ -332,4 +384,3 @@ void fused_adam_cuda_mt(
}
THCudaCheck(cudaGetLastError());
}
@@ -13,6 +13,17 @@
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename x_t, typename y_t, typename out_t>
struct AxpbyFunctor
{
@@ -43,46 +54,74 @@ struct AxpbyFunctor
n -= chunk_idx*chunk_size;
bool finite = true;
x_t r_x[ILP];
y_t r_y[ILP];
out_t r_out[ILP];
// to make things simple, we put aligned case in a different code path
if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x) && is_aligned(y) && is_aligned(out))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_x, x, 0 , i_start);
load_store(r_y, y, 0 , i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
if(arg_to_check == -1)
finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
if(arg_to_check == 0)
finite = finite && isfinite(r_x[ii]);
if(arg_to_check == 1)
finite = finite && isfinite(r_y[ii]);
}
// store
load_store(out, r_out, i_start , 0);
}
}
else
{
// Non-divergent exit condition for __syncthreads, not necessary here
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_x[ii] = 0;
r_y[ii] = 0;
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_x[ii] = x[i];
r_y[ii] = y[i];
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
if(arg_to_check == -1)
finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
if(arg_to_check == 0)
finite = finite && isfinite(r_x[ii]);
if(arg_to_check == 1)
finite = finite && isfinite(r_y[ii]);
}
// see note in multi_tensor_scale_kernel.cu
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
out[i] = r_out[ii];
}
}
}
if(!finite)
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
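// (The race is benign: every thread that sees a non-finite value writes the same
// flag value, so the final contents of *noop_gmem do not depend on ordering.)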
}
};
@@ -13,6 +13,17 @@
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename x_t>
struct L2NormFunctor
{
@@ -41,12 +52,33 @@ struct L2NormFunctor
__shared__ float s_vals[512];
float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
x_t r_x[ILP];
for(int i = 0; i < ILP; i++)
{
vals[i] = 0.f;
r_x[i] = 0;
}
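// Each thread accumulates ILP partial sums of squares in vals[]; these are later
// collapsed into a single per-thread value and reduced across the block using
// the shared s_vals buffer.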
// to make things simple, we put aligned case in a different code path
if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_x, x, 0 , i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
float next = static_cast<float>(r_x[ii]);
vals[ii] += next*next;
}
}
}
else
{
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
@@ -57,6 +89,7 @@
}
}
}
}
float val = 0.f;
for(int i = 0; i < ILP; i++)
@@ -104,12 +137,33 @@ struct MaxNormFunctor
__shared__ float s_vals[512];
float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
x_t r_x[ILP];
for(int i = 0; i < ILP; i++)
{
vals[i] = 0.f;
r_x[i] = 0;
}
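// Same structure as L2NormFunctor above, except vals[] tracks running maxima of
// absolute values rather than sums of squares.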
// to make things simple, we put aligned case in a different code path
if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_x, x, 0 , i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
float next = static_cast<float>(r_x[ii]);
vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
}
}
}
else
{
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
@@ -120,6 +174,7 @@
}
}
}
}
float val = 0.f;
for(int i = 0; i < ILP; i++)
@@ -13,6 +13,17 @@
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
typedef enum{
MOMENT_MODE_0 =0, // L2 regularization mode
MOMENT_MODE_1 =1 // Decoupled weight decay mode
@@ -68,6 +79,83 @@ struct LAMBStage1Functor
n -= chunk_idx*chunk_size;
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
MATH_T r_v[ILP];
// to make things simple, we put aligned case in a different code path
if(n % ILP == 0 &&
chunk_size % ILP == 0 &&
is_aligned(g) &&
is_aligned(p) &&
is_aligned(m) &&
is_aligned(v))
{
T l_g[ILP];
T l_p[ILP];
T l_m[ILP];
T l_v[ILP];
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(l_g, g, 0, i_start);
if (decay != 0)
load_store(l_p, p, 0, i_start);
load_store(l_m, m, 0, i_start);
load_store(l_v, v, 0, i_start);
// unpack
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_g[ii] = l_g[ii];
if (decay == 0) {
r_p[ii] = MATH_T(0);
}
else {
r_p[ii] = l_p[ii];
}
r_m[ii] = l_m[ii];
r_v[ii] = l_v[ii];
}
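// MOMENT_MODE_0 folds decay*p into the scaled gradient before the moment updates
// (classic L2 regularization); MOMENT_MODE_1 leaves the gradient untouched and
// adds decay*p to the final update instead (decoupled weight decay).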
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
if (mode == MOMENT_MODE_0) {
MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
// L2 on scaled grad
scaled_grad = scaled_grad + decay*r_p[ii];
r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
r_p[ii] = next_m_unbiased / denom;
}
else {
MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
l_p[ii] = r_p[ii];
l_m[ii] = r_m[ii];
l_v[ii] = r_v[ii];
}
// store
load_store(g, l_p, i_start, 0);
load_store(m, l_m, i_start, 0);
load_store(v, l_v, i_start, 0);
}
}
else
{
// see note in multi_tensor_scale_kernel.cu
for(int i_start = 0;
i_start < n && i_start < chunk_size;
@@ -137,6 +225,7 @@ struct LAMBStage1Functor
}
}
}
}
};
// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
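// (The per-tensor trust ratio param_norm/update_norm is folded, together with the
// learning rate, into the single 'ratio' factor used in this functor, so the
// chunk loop only has to apply p -= ratio * update.)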
@@ -173,6 +262,29 @@ struct LAMBStage2Functor
n -= chunk_idx*chunk_size;
// to make things simple, we put aligned case in a different code path
if(n % ILP == 0 &&
chunk_size % ILP == 0 &&
is_aligned(p) &&
is_aligned(update))
{
T r_p[ILP];
T r_update[ILP];
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_p, p, 0, i_start);
load_store(r_update, update, 0, i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_p[ii] = static_cast<MATH_T>(r_p[ii]) - (ratio * static_cast<MATH_T>(r_update[ii]));
}
load_store(p, r_p, i_start, 0);
}
}
else
{
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
@@ -205,6 +317,7 @@ struct LAMBStage2Functor
}
}
}
}
};
@@ -15,6 +15,17 @@
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename in_t, typename out_t>
struct ScaleFunctor
{
@@ -40,38 +51,62 @@ struct ScaleFunctor
n -= chunk_idx*chunk_size;
bool finite = true;
in_t r_in[ILP];
out_t r_out[ILP];
// to make things simple, we put aligned case in a different code path
if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(in) && is_aligned(out))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_in, in, 0 , i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_out[ii] = static_cast<float>(r_in[ii]) * scale;
finite = finite && isfinite(r_in[ii]);
}
// store
load_store(out, r_out, i_start, 0);
}
}
else
{
// Non-divergent exit condition for __syncthreads, not necessary here
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_in[ii] = 0;
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
r_in[ii] = in[i];
}
// note for clarification to future michael:
// From a pure memory dependency perspective, there's likely no point unrolling
// the write loop, since writes just fire off once their LDGs arrive.
// Put another way, the STGs are dependent on the LDGs, but not on each other.
// There is still compute ILP benefit from unrolling the loop though.
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_out[ii] = static_cast<float>(r_in[ii]) * scale;
finite = finite && isfinite(r_in[ii]);
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
out[i] = r_out[ii];
}
}
}
if(!finite)
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
}
};