Merge

0bfb8300 · Thor Johnsen · 2619f1cb · cf50dc7c · 0bfb8300 · 0bfb8300
Commit 0bfb8300 authored May 08, 2020 by Thor Johnsen
9 changed files
--- a/csrc/multi_tensor_l2norm_kernel.cu
+++ b/csrc/multi_tensor_l2norm_kernel.cu
@@ -13,6 +13,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
+}
+
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
 template<typename x_t>
 struct L2NormFunctor
 {
@@ -41,22 +52,44 @@ struct L2NormFunctor
    __shared__ float s_vals[512];

    float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
+    x_t r_x[ILP];
    for(int i = 0; i < ILP; i++)
+    {
      vals[i] = 0.f;
+      r_x[i] = 0;
+    }

-    for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
+    // Fast path: taken only when n and chunk_size are multiples of ILP and x passes is_aligned; each load_store then moves ILP elements at once.
+    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
    {
-      #pragma unroll
-      for(int ii = 0; ii < ILP; ii++)
+      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
-        int i = i_start + threadIdx.x + ii*blockDim.x;
-        if(i < n && i < chunk_size)
+        // load
+        load_store(r_x, x, 0 , i_start);
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
        {
-          float next = static_cast<float>(x[i]);
+          float next = static_cast<float>(r_x[ii]);
          vals[ii] += next*next;
        }
      }
    }
+    else
+    {
+      for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
+      {
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)
+          {
+            float next = static_cast<float>(x[i]);
+            vals[ii] += next*next;
+          }
+        }
+      }
+    }

    float val = 0.f;
    for(int i = 0; i < ILP; i++)
@@ -104,22 +137,44 @@ struct MaxNormFunctor
    __shared__ float s_vals[512];

    float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
+    x_t r_x[ILP];
    for(int i = 0; i < ILP; i++)
+    {
      vals[i] = 0.f;
+      r_x[i] = 0;
+    }

-    for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
+    // Fast path: taken only when n and chunk_size are multiples of ILP and x passes is_aligned; each load_store then moves ILP elements at once.
+    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
    {
-      #pragma unroll
-      for(int ii = 0; ii < ILP; ii++)
+      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
-        int i = i_start + threadIdx.x + ii*blockDim.x;
-        if(i < n && i < chunk_size)
+        // load
+        load_store(r_x, x, 0 , i_start);
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
        {
-          float next = static_cast<float>(x[i]);
+          float next = static_cast<float>(r_x[ii]);
          vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
        }
      }
    }
+    else
+    {
+      for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
+      {
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)
+          {
+            float next = static_cast<float>(x[i]);
+            vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
+          }
+        }
+      }
+    }

    float val = 0.f;
    for(int i = 0; i < ILP; i++)

--- a/csrc/multi_tensor_lamb.cu
+++ b/csrc/multi_tensor_lamb.cu
@@ -13,6 +13,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
+}
+
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
 typedef enum{
  MOMENT_MODE_0   =0, // L2 regularization mode
  MOMENT_MODE_1   =1  // Decoupled weight decay mode
@@ -68,71 +79,149 @@ struct LAMBStage1Functor

    n -= chunk_idx*chunk_size;

-    // see note in multi_tensor_scale_kernel.cu
-    for(int i_start = 0;
-            i_start < n && i_start < chunk_size;
-            i_start += blockDim.x*ILP)
+    MATH_T r_g[ILP];
+    MATH_T r_p[ILP];
+    MATH_T r_m[ILP];
+    MATH_T r_v[ILP];
+    // Fast path: taken only when n and chunk_size are multiples of ILP and g, p, m and v all pass is_aligned; each load_store then moves ILP elements at once.
+    if(n % ILP == 0 &&
+       chunk_size % ILP == 0 &&
+       is_aligned(g) &&
+       is_aligned(p) &&
+       is_aligned(m) &&
+       is_aligned(v))
    {
-      MATH_T r_g[ILP];
-      MATH_T r_p[ILP];
-      MATH_T r_m[ILP];
-      MATH_T r_v[ILP];
-#pragma unroll
-      for(int ii = 0; ii < ILP; ii++)
+      T l_g[ILP];
+      T l_p[ILP];
+      T l_m[ILP];
+      T l_v[ILP];
+      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
-        int i = i_start + threadIdx.x + ii*blockDim.x;
-        if(i < n && i < chunk_size)
+        // load
+        load_store(l_g, g, 0, i_start);
+        if (decay != 0)
+          load_store(l_p, p, 0, i_start);
+        load_store(l_m, m, 0, i_start);
+        load_store(l_v, v, 0, i_start);
+        // unpack
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
        {
-          r_g[ii] = g[i];
-          // special ?optimization? for lamb stage 1
+          r_g[ii] = l_g[ii];
          if (decay == 0) {
            r_p[ii] = MATH_T(0);
          }
          else {
-            r_p[ii] = p[i];
+            r_p[ii] = l_p[ii];
          }
-          r_m[ii] = m[i];
-          r_v[ii] = v[i];
-        } else {
-          r_g[ii] = MATH_T(0);
-          r_p[ii] = MATH_T(0);
-          r_m[ii] = MATH_T(0);
-          r_v[ii] = MATH_T(0);
+          r_m[ii] = l_m[ii];
+          r_v[ii] = l_v[ii];
        }
-      }
 #pragma unroll
-      for(int ii = 0; ii < ILP; ii++)
-      {
-        if (mode == MOMENT_MODE_0) {
-          MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
-          // L2 on scaled grad
-          scaled_grad = scaled_grad + decay*r_p[ii];
-          r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
-          r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
-          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
-          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
-          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
-          r_p[ii] = next_m_unbiased / denom;
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          if (mode == MOMENT_MODE_0) {
+            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
+            // L2 on scaled grad
+            scaled_grad = scaled_grad + decay*r_p[ii];
+            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
+            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
+            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
+            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
+            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
+            r_p[ii] = next_m_unbiased / denom;
+          }
+          else {
+            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
+            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
+            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
+            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
+            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
+            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
+            r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
+          }
        }
-        else {
-          MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
-          r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
-          r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
-          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
-          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
-          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
-          r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          l_p[ii] = r_p[ii];
+          l_m[ii] = r_m[ii];
+          l_v[ii] = r_v[ii];
        }
+        // store
+        load_store(g, l_p, i_start, 0);
+        load_store(m, l_m, i_start, 0);
+        load_store(v, l_v, i_start, 0);
      }
-#pragma unroll
-      for(int ii = 0; ii < ILP; ii++)
+    }
+    else
+    {
+      // see note in multi_tensor_scale_kernel.cu
+      for(int i_start = 0;
+          i_start < n && i_start < chunk_size;
+          i_start += blockDim.x*ILP)
      {
-        int i = i_start + threadIdx.x + ii*blockDim.x;
-        if(i < n && i < chunk_size)
+        MATH_T r_g[ILP];
+        MATH_T r_p[ILP];
+        MATH_T r_m[ILP];
+        MATH_T r_v[ILP];
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
        {
-          g[i] = r_p[ii];
-          m[i] = r_m[ii];
-          v[i] = r_v[ii];
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)
+          {
+            r_g[ii] = g[i];
+            // decay == 0: p only ever appears multiplied by decay below, so skip loading it and use 0
+            if (decay == 0) {
+              r_p[ii] = MATH_T(0);
+            }
+            else {
+              r_p[ii] = p[i];
+            }
+            r_m[ii] = m[i];
+            r_v[ii] = v[i];
+          } else {
+            r_g[ii] = MATH_T(0);
+            r_p[ii] = MATH_T(0);
+            r_m[ii] = MATH_T(0);
+            r_v[ii] = MATH_T(0);
+          }
+        }
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          if (mode == MOMENT_MODE_0) {
+            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
+            // L2 on scaled grad
+            scaled_grad = scaled_grad + decay*r_p[ii];
+            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
+            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
+            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
+            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
+            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
+            r_p[ii] = next_m_unbiased / denom;
+          }
+          else {
+            MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
+            r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
+            r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
+            MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
+            MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
+            MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
+            r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
+          }
+        }
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)
+          {
+            g[i] = r_p[ii];
+            m[i] = r_m[ii];
+            v[i] = r_v[ii];
+          }
        }
      }
    }
@@ -173,34 +262,58 @@ struct LAMBStage2Functor

    n -= chunk_idx*chunk_size;

-    for(int i_start = 0;
-            i_start < n && i_start < chunk_size;
-            i_start += blockDim.x*ILP)
+    // Fast path: taken only when n and chunk_size are multiples of ILP and both p and update pass is_aligned; each load_store then moves ILP elements at once.
+    if(n % ILP == 0 &&
+       chunk_size % ILP == 0 &&
+       is_aligned(p) &&
+       is_aligned(update))
    {
-      MATH_T r_p[ILP];
-      MATH_T r_update[ILP];
-#pragma unroll
-      for(int ii = 0; ii < ILP; ii++)
+      T r_p[ILP];
+      T r_update[ILP];
+      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
-       	int i = i_start + threadIdx.x + ii*blockDim.x;
-        if(i < n && i < chunk_size)
+        // load
+        load_store(r_p, p, 0, i_start);
+        load_store(r_update, update, 0, i_start);
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
        {
-          r_p[ii] = p[i];
-          r_update[ii] = update[i];
+          r_p[ii] = static_cast<MATH_T>(r_p[ii]) - (ratio * static_cast<MATH_T>(r_update[ii]));
        }
+        load_store(p, r_p, i_start, 0);
      }
-#pragma unroll
-      for(int ii = 0; ii < ILP; ii++)
+    }
+    else
+    {
+      for(int i_start = 0;
+          i_start < n && i_start < chunk_size;
+          i_start += blockDim.x*ILP)
      {
-       	r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
-      }
+        MATH_T r_p[ILP];
+        MATH_T r_update[ILP];
 #pragma unroll
-      for(int ii = 0; ii < ILP; ii++)
-      {
-        int i = i_start + threadIdx.x + ii*blockDim.x;
-        if(i < n && i < chunk_size)
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)
+          {
+            r_p[ii] = p[i];
+            r_update[ii] = update[i];
+          }
+        }
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
+        }
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
        {
-          p[i] = r_p[ii];
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)
+          {
+            p[i] = r_p[ii];
+          }
        }
      }
    }

--- a/csrc/multi_tensor_scale_kernel.cu
+++ b/csrc/multi_tensor_scale_kernel.cu
@@ -15,6 +15,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4

+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
+}
+
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
 template<typename in_t, typename out_t>
 struct ScaleFunctor
 {
@@ -34,44 +45,68 @@ struct ScaleFunctor

    in_t* in = (in_t*)tl.addresses[0][tensor_loc];
    in += chunk_idx*chunk_size;
-   
+
    out_t* out = (out_t*)tl.addresses[1][tensor_loc];
    out += chunk_idx*chunk_size;

    n -= chunk_idx*chunk_size;

-    // Non-divergent exit condition for __syncthreads, not necessary here
-    float incoming_vals[ILP];
-    for(int i_start = 0;
-        i_start < n && i_start < chunk_size;
-        i_start += blockDim.x*ILP)
+    bool finite = true;
+    in_t r_in[ILP];
+    out_t r_out[ILP];
+
+    // Fast path: taken only when n and chunk_size are multiples of ILP and both in and out pass is_aligned; each load_store then moves ILP elements at once.
+    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(in) && is_aligned(out))
    {
-      #pragma unroll
-      for(int ii = 0; ii < ILP; ii++)
+      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
      {
-        incoming_vals[ii] = 0;
-        int i = i_start + threadIdx.x + ii*blockDim.x;
-        if(i < n && i < chunk_size)
-          incoming_vals[ii] = static_cast<float>(in[i]);
+        // load
+        load_store(r_in, in, 0 , i_start);
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          r_out[ii] = static_cast<float>(r_in[ii]) * scale;
+          finite = finite && isfinite(r_in[ii]);
+        }
+        // store
+        load_store(out, r_out, i_start, 0);
      }
-
-      // note for clarification to future michael:
-      // From a pure memory dependency perspective, there's likely no point unrolling
-      // the write loop, since writes just fire off once their LDGs arrive.
-      // Put another way, the STGs are dependent on the LDGs, but not on each other.
-      // There is still compute ILP benefit from unrolling the loop though.
-      #pragma unroll
-      for(int ii = 0; ii < ILP; ii++)
+    }
+    else
+    {
+      // The loop bound does not depend on threadIdx (non-divergent exit, as __syncthreads would require); that is not actually necessary here.
+      for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
      {
-        int i = i_start + threadIdx.x + ii*blockDim.x;
-        if(i < n && i < chunk_size)
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          r_in[ii] = 0;
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)
+            r_in[ii] = in[i];
+        }
+        // note for clarification to future michael:
+        // From a pure memory dependency perspective, there's likely no point unrolling
+        // the write loop, since writes just fire off once their LDGs arrive.
+        // Put another way, the STGs are dependent on the LDGs, but not on each other.
+        // There is still compute ILP benefit from unrolling the loop though.
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          r_out[ii] = static_cast<float>(r_in[ii]) * scale;
+          finite = finite && isfinite(r_in[ii]);
+        }
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
        {
-          out[i] = static_cast<out_t>(incoming_vals[ii]*scale);
-          if(!isfinite(incoming_vals[ii]))
-            *noop_gmem = 1; // Blindly fire off a write.  These will race but that's ok.
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)
+            out[i] = r_out[ii];
        }
      }
    }
+    if(!finite)
+      *noop_gmem = 1; // Blindly fire off a write.  These will race but that's ok.
  }
 };


--- a/setup.py
+++ b/setup.py
@@ -138,6 +138,13 @@ if "--cuda_ext" in sys.argv:
                                                      '-O3',
                                                      '--use_fast_math'] + version_dependent_macros}))

+        ext_modules.append(
+            CUDAExtension(name='mlp_cuda',
+                          sources=['csrc/mlp.cpp',
+                                   'csrc/mlp_cuda.cu'],
+                          extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
+                                              'nvcc':['-O3'] + version_dependent_macros}))
+
 if "--bnp" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--bnp")
@@ -198,6 +205,32 @@ if "--deprecated_fused_adam" in sys.argv:
                                              'nvcc':['-O3',
                                                      '--use_fast_math'] + version_dependent_macros}))

+if "--deprecated_fused_lamb" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    sys.argv.remove("--deprecated_fused_lamb")
+
+    from torch.utils.cpp_extension import BuildExtension
+    cmdclass['build_ext'] = BuildExtension
+
+    if torch.utils.cpp_extension.CUDA_HOME is None:
+        raise RuntimeError("--deprecated_fused_lamb was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
+    else:
+        ext_modules.append(
+            CUDAExtension(name='fused_lamb_cuda',
+                          sources=['apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp',
+                                   'apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu',
+                                   'csrc/multi_tensor_l2norm_kernel.cu'],
+                          include_dirs=[os.path.join(this_dir, 'csrc')],
+                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
+                                              'nvcc':['-O3',
+                                                      '--use_fast_math'] + version_dependent_macros}))
+
+# If ATen/CUDAGenerator.h is found, define OLD_GENERATOR; otherwise the new ATen/CUDAGeneratorImpl.h is used, due to the breaking change in https://github.com/pytorch/pytorch/pull/36026
+generator_flag = []
+torch_dir = torch.__path__[0]
+if os.path.exists(os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h')):
+    generator_flag = ['-DOLD_GENERATOR']
+
 if "--fast_multihead_attn" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--fast_multihead_attn")
@@ -213,7 +246,7 @@ if "--fast_multihead_attn" in sys.argv:
            CUDAExtension(name='fast_self_multihead_attn',
                          sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn.cpp',
                                   'apex/contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu'],
-                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
+                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag,
                                              'nvcc':['-O3',
                                                      '-gencode', 'arch=compute_70,code=sm_70',
                                                      '-I./apex/contrib/csrc/multihead_attn/cutlass/',
@@ -221,12 +254,12 @@ if "--fast_multihead_attn" in sys.argv:
                                                      '-U__CUDA_NO_HALF_CONVERSIONS__',
                                                      '--expt-relaxed-constexpr',
                                                      '--expt-extended-lambda',
-                                                      '--use_fast_math'] + version_dependent_macros}))
+                                                      '--use_fast_math'] + version_dependent_macros + generator_flag}))
        ext_modules.append(
            CUDAExtension(name='fast_self_multihead_attn_norm_add',
                          sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add.cpp',
                                   'apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu'],
-                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
+                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag,
                                              'nvcc':['-O3',
                                                      '-gencode', 'arch=compute_70,code=sm_70',
                                                      '-I./apex/contrib/csrc/multihead_attn/cutlass/',
@@ -234,12 +267,12 @@ if "--fast_multihead_attn" in sys.argv:
                                                      '-U__CUDA_NO_HALF_CONVERSIONS__',
                                                      '--expt-relaxed-constexpr',
                                                      '--expt-extended-lambda',
-                                                      '--use_fast_math'] + version_dependent_macros}))
+                                                      '--use_fast_math'] + version_dependent_macros + generator_flag}))
        ext_modules.append(
            CUDAExtension(name='fast_encdec_multihead_attn',
                          sources=['apex/contrib/csrc/multihead_attn/encdec_multihead_attn.cpp',
                                   'apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu'],
-                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
+                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag,
                                              'nvcc':['-O3',
                                                      '-gencode', 'arch=compute_70,code=sm_70',
                                                      '-I./apex/contrib/csrc/multihead_attn/cutlass/',
@@ -247,12 +280,12 @@ if "--fast_multihead_attn" in sys.argv:
                                                      '-U__CUDA_NO_HALF_CONVERSIONS__',
                                                      '--expt-relaxed-constexpr',
                                                      '--expt-extended-lambda',
-                                                      '--use_fast_math'] + version_dependent_macros}))
+                                                      '--use_fast_math'] + version_dependent_macros + generator_flag}))
        ext_modules.append(
            CUDAExtension(name='fast_encdec_multihead_attn_norm_add',
                          sources=['apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add.cpp',
                                   'apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu'],
-                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
+                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag,
                                              'nvcc':['-O3',
                                                      '-gencode', 'arch=compute_70,code=sm_70',
                                                      '-I./apex/contrib/csrc/multihead_attn/cutlass/',
@@ -260,7 +293,7 @@ if "--fast_multihead_attn" in sys.argv:
                                                      '-U__CUDA_NO_HALF_CONVERSIONS__',
                                                      '--expt-relaxed-constexpr',
                                                      '--expt-extended-lambda',
-                                                      '--use_fast_math'] + version_dependent_macros}))
+                                                      '--use_fast_math'] + version_dependent_macros + generator_flag}))

 setup(
    name='apex',

--- a/tests/L0/run_amp/test_larc.py
+++ b/tests/L0/run_amp/test_larc.py
+import unittest
+
+import torch
+from torch import nn
+from torch.nn import Parameter
+
+from apex import amp
+from apex.parallel.LARC import LARC
+from utils import common_init
+
+
+class MyModel(torch.nn.Module):
+    def __init__(self, unique):
+        super(MyModel, self).__init__()
+        self.weight0 = Parameter(
+            unique + torch.arange(2, device="cuda", dtype=torch.float32)
+        )
+
+    def forward(self, input):
+        return (input * self.weight0).sum()
+
+
+class TestLARC(unittest.TestCase):
+    def setUp(self):
+        self.x = torch.ones((2), device="cuda", dtype=torch.float32)
+        common_init(self)
+
+    def tearDown(self):
+        pass
+
+    def test_larc_mixed_precision(self):
+        for opt_level in ["O0", "O1", "O2", "O3"]:
+            model = MyModel(1)
+
+            optimizer = LARC(
+                torch.optim.SGD(
+                    [{"params": model.parameters(), "lr": 0.25}], momentum=0.125
+                )
+            )
+
+            model, optimizer = amp.initialize(
+                model, optimizer, opt_level=opt_level, verbosity=0
+            )
+
+            optimizer.zero_grad()
+            loss = model(self.x)
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+            optimizer.step()
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/L0/run_mlp/test_mlp.py
+++ b/tests/L0/run_mlp/test_mlp.py
+"""Tests for c++ MLP"""
+import unittest
+from time import time
+import numpy as np
+
+import torch
+from torch import nn
+
+from apex.mlp import MLP
+
+batch_size = 1024
+mlp_sizes = [480, 1024, 1024, 512, 256, 1]
+num_iters = 10
+
+class TestMLP(unittest.TestCase):
+
+    def test_creation(self):
+        MLP(mlp_sizes)
+
+    def test_numeric(self):
+        mlp = MLP(mlp_sizes).cuda()
+
+        mlp_layers = []
+        for i in range(mlp.num_layers):
+            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
+            mlp.weights[i].data.copy_(linear.weight)
+            mlp.biases[i].data.copy_(linear.bias)
+            mlp_layers.append(linear)
+            mlp_layers.append(nn.ReLU(inplace=True))
+
+        ref_mlp = nn.Sequential(*mlp_layers).cuda()
+
+        test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
+        ref_input = test_input.clone().detach().requires_grad_()
+        mlp_out = mlp(test_input)
+        ref_out = ref_mlp(ref_input)
+        np.testing.assert_allclose(
+            mlp_out.detach().cpu().numpy(),
+            ref_out.detach().cpu().numpy(),
+            atol=1e-7, rtol=1e-5)
+
+        # Use the mean value as a scalar loss, multiplied by 10 so that it is large enough not to zero out.
+        mlp_out.mean().mul(10.).backward()
+        ref_out.mean().mul(10.).backward()
+        np.testing.assert_allclose(
+            test_input.grad.detach().cpu().numpy(),
+            ref_input.grad.detach().cpu().numpy(),
+            atol=0, rtol=1e-5)
+        np.testing.assert_allclose(
+            mlp.biases[0].grad.detach().cpu().numpy(),
+            ref_mlp[0].bias.grad.detach().cpu().numpy(),
+            atol=1e-7, rtol=1e-5)
+
+    def test_no_bias(self):
+        for use_activation in ['none', 'relu', 'sigmoid']:
+            mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()
+
+            mlp_layers = []
+            for i in range(mlp.num_layers):
+                linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=False)
+                mlp.weights[i].data.copy_(linear.weight)
+                mlp_layers.append(linear)
+                if use_activation == 'relu':
+                    mlp_layers.append(nn.ReLU(inplace=True))
+                if use_activation == 'sigmoid':
+                    mlp_layers.append(nn.Sigmoid())
+
+            ref_mlp = nn.Sequential(*mlp_layers).cuda()
+
+            test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
+            ref_input = test_input.clone().detach().requires_grad_()
+            mlp_out = mlp(test_input)
+            ref_out = ref_mlp(ref_input)
+            np.testing.assert_allclose(
+                mlp_out.detach().cpu().numpy(),
+                ref_out.detach().cpu().numpy(),
+                atol=1e-7, rtol=1e-5)
+
+            # Use the mean value as a scalar loss, multiplied by 10 so that it is large enough not to zero out.
+            mlp_out.mean().mul(10.).backward()
+            ref_out.mean().mul(10.).backward()
+            np.testing.assert_allclose(
+                test_input.grad.detach().cpu().numpy(),
+                ref_input.grad.detach().cpu().numpy(),
+                atol=0, rtol=100)
+            np.testing.assert_allclose(
+                mlp.weights[0].grad.detach().cpu().numpy(),
+                ref_mlp[0].weight.grad.detach().cpu().numpy(),
+                atol=1e-7, rtol=100)
+
+    def test_with_bias(self):
+        """Check ``MLP`` with bias against an ``nn.Sequential`` reference.
+
+        For each activation ('none', 'relu', 'sigmoid') a reference network is
+        built from ``nn.Linear`` layers whose parameters are copied into the
+        ``MLP``. Forward outputs and the gradients of the input, the first
+        weight and the first bias are then compared.
+        """
+        def as_numpy(tensor):
+            # Detach and move to host memory so numpy can compare the values.
+            return tensor.detach().cpu().numpy()
+
+        for activation in ('none', 'relu', 'sigmoid'):
+            mlp = MLP(mlp_sizes, bias=True, activation=activation).cuda()
+
+            reference_layers = []
+            for layer_idx in range(mlp.num_layers):
+                reference_linear = nn.Linear(
+                    mlp_sizes[layer_idx], mlp_sizes[layer_idx + 1], bias=True)
+                # Both implementations must start from identical parameters.
+                mlp.weights[layer_idx].data.copy_(reference_linear.weight)
+                mlp.biases[layer_idx].data.copy_(reference_linear.bias)
+                reference_layers.append(reference_linear)
+                if activation == 'relu':
+                    reference_layers.append(nn.ReLU(inplace=True))
+                elif activation == 'sigmoid':
+                    reference_layers.append(nn.Sigmoid())
+
+            ref_mlp = nn.Sequential(*reference_layers).cuda()
+
+            test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda")
+            test_input = test_input.uniform_(-1., 1.).requires_grad_()
+            # Independent leaf holding the same values, so each network
+            # accumulates its own input gradient.
+            ref_input = test_input.clone().detach().requires_grad_()
+
+            mlp_out = mlp(test_input)
+            ref_out = ref_mlp(ref_input)
+            np.testing.assert_allclose(
+                as_numpy(mlp_out), as_numpy(ref_out), atol=1e-7, rtol=1e-5)
+
+            # Reduce to a scalar loss with mean(); the factor of 10 keeps the
+            # gradients from becoming too small.
+            mlp_out.mean().mul(10.).backward()
+            ref_out.mean().mul(10.).backward()
+
+            np.testing.assert_allclose(
+                as_numpy(test_input.grad), as_numpy(ref_input.grad),
+                atol=0, rtol=1)
+            np.testing.assert_allclose(
+                as_numpy(mlp.weights[0].grad), as_numpy(ref_mlp[0].weight.grad),
+                atol=1e-7, rtol=1)
+            np.testing.assert_allclose(
+                as_numpy(mlp.biases[0].grad), as_numpy(ref_mlp[0].bias.grad),
+                atol=1e-7, rtol=1e-5)
+
+    def test_no_grad(self):
+        """Compare ``MLP`` with a ReLU ``nn.Sequential`` reference when the
+        input tensor does not require gradients.
+
+        Forward outputs and the gradient of the first layer's weight are
+        checked; no input gradient is compared.
+        """
+        def as_numpy(tensor):
+            # Detach and move to host memory so numpy can compare the values.
+            return tensor.detach().cpu().numpy()
+
+        mlp = MLP(mlp_sizes).cuda()
+
+        reference_layers = []
+        for layer_idx in range(mlp.num_layers):
+            reference_linear = nn.Linear(
+                mlp_sizes[layer_idx], mlp_sizes[layer_idx + 1])
+            # Both implementations must start from identical parameters.
+            mlp.weights[layer_idx].data.copy_(reference_linear.weight)
+            mlp.biases[layer_idx].data.copy_(reference_linear.bias)
+            reference_layers += [reference_linear, nn.ReLU(inplace=True)]
+
+        ref_mlp = nn.Sequential(*reference_layers).cuda()
+
+        # requires_grad_() is deliberately not called on either input.
+        test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda")
+        test_input = test_input.uniform_(-1., 1.)
+        ref_input = test_input.clone().detach()
+
+        mlp_out = mlp(test_input)
+        ref_out = ref_mlp(ref_input)
+        np.testing.assert_allclose(
+            as_numpy(mlp_out), as_numpy(ref_out), atol=1e-7, rtol=1e-5)
+
+        # Reduce to a scalar loss with mean(); the factor of 10 keeps the
+        # gradients from becoming too small.
+        mlp_out.mean().mul(10.).backward()
+        ref_out.mean().mul(10.).backward()
+        np.testing.assert_allclose(
+            as_numpy(mlp.weights[0].grad), as_numpy(ref_mlp[0].weight.grad),
+            atol=1e-7, rtol=1e-5)
+
+
+    def test_performance_half(self):
+        """Time half-precision forward/backward passes of ``MLP`` against an
+        ``nn.Sequential`` reference and print the average time per iteration.
+
+        This is a benchmark only: it makes no assertions about outputs or
+        about the measured times.
+        """
+        mlp = MLP(mlp_sizes).cuda().half()
+
+        # Build the reference from nn.Linear + ReLU layers and copy each
+        # layer's parameters into the MLP under test.
+        mlp_layers = []
+        for i in range(mlp.num_layers):
+            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
+            mlp.weights[i].data.copy_(linear.weight)
+            mlp.biases[i].data.copy_(linear.bias)
+            mlp_layers.append(linear)
+            mlp_layers.append(nn.ReLU(inplace=True))
+
+        ref_mlp = nn.Sequential(*mlp_layers).cuda().half()
+
+        # Two separate half-precision inputs, both filled with the constant 10.
+        test_input = torch.empty(
+            batch_size, mlp_sizes[0], device="cuda", dtype=torch.half).fill_(10.).requires_grad_()
+        ref_input = torch.empty(
+            batch_size, mlp_sizes[0], device="cuda", dtype=torch.half).fill_(10.).requires_grad_()
+
+        # Warm up GPU
+        for _ in range(100):
+            ref_out = ref_mlp(ref_input)
+            ref_loss = ref_out.mean()
+            ref_mlp.zero_grad()
+            ref_loss.backward()
+            mlp_out = mlp(test_input)
+            test_loss = mlp_out.mean()
+            mlp.zero_grad()
+            test_loss.backward()
+
+        torch.cuda.profiler.start()
+        # Synchronize before reading the clock so previously queued GPU work
+        # is not attributed to the timed region.
+        torch.cuda.synchronize()
+        # NOTE(review): time() and num_iters are defined outside this block —
+        # presumably time.time and a module-level iteration count; confirm.
+        start_time = time()
+        for _ in range(num_iters):
+            ref_out = ref_mlp(ref_input)
+            ref_loss = ref_out.mean()
+            ref_mlp.zero_grad()
+            ref_loss.backward()
+        # Wait for the queued GPU work to finish before stopping the timer.
+        torch.cuda.synchronize()
+        stop_time = time()
+        print(F"\nPytorch MLP time {(stop_time - start_time) * 1000. / num_iters:.4f} ms")
+
+        # Same measurement for the MLP under test.
+        torch.cuda.synchronize()
+        start_time = time()
+        for _ in range(num_iters):
+            mlp_out = mlp(test_input)
+            test_loss = mlp_out.mean()
+            mlp.zero_grad()
+            test_loss.backward()
+        torch.cuda.synchronize()
+        stop_time = time()
+        print(F"C++ MLP time {(stop_time - start_time) * 1000. / num_iters:.4f} ms")
+        torch.cuda.profiler.stop()
+
+# Run this module's tests through unittest when it is executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/tests/L0/run_pyprof_data/__init__.py
+++ b/tests/L0/run_pyprof_data/__init__.py
--- a/tests/L0/run_pyprof_data/test_pyprof_data.py
+++ b/tests/L0/run_pyprof_data/test_pyprof_data.py
+import inspect
+import unittest
+
+from apex.pyprof.prof.data import Data
+from apex.pyprof.prof.prof import foo
+
+
+class TestPyProfData(unittest.TestCase):
+	"""Smoke test that feeds recorded kernel dictionaries through ``Data`` and ``foo``."""
+
+	def __init__(self, testName):
+		"""Forward the name of the test method to ``unittest.TestCase``."""
+		super().__init__(testName)
+
+	def setUp(self):
+		"""No fixtures are created before a test."""
+
+	def tearDown(self):
+		"""Nothing needs to be cleaned up after a test."""
+
+	def test_data(self):
+		"""Build ``Data`` for each recorded kernel, run ``foo`` on it and store the params.
+
+		No assertions are made; the test passes if none of the calls raise.
+		"""
+		# Two recorded kernel entries, kept verbatim (one per line).
+		kernels = [
+			{'kShortName': 'elementwise_kernel', 'kDuration': 2848, 'layer': [], 'trace': [], 'reprMarkers': [], 'marker': ["{'mod': 'Tensor', 'op': 'float', 'args': [{'name': '', 'type': 'tensor', 'shape': (18, 104, 160), 'dtype': 'bool'}]}"], 'seqMarker': ['to, seq = 60471'], 'seqId': [60471], 'subSeqId': 0, 'altSeqId': [], 'dir': 'fprop', 'mod': ['Tensor'], 'op': ['float'], 'tid': 1431533376, 'device': 0, 'stream': 7, 'grid': (585, 1, 1), 'block': (512, 1, 1), 'kLongName': 'void at::native::elementwise_kernel<512, 1, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1} const&)::{lambda(int)#1}>(int, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1} const&)::{lambda(int)#1})'},
+			{'kShortName': 'elementwise_kernel', 'kDuration': 201182, 'layer': [], 'trace': [], 'reprMarkers': [], 'marker': ["{'mod': 'Tensor', 'op': 'clone', 'args': [{'name': '', 'type': 'tensor', 'shape': (18, 4, 416, 640), 'dtype': 'float32'}]}"], 'seqMarker': ['clone, seq = 60161'], 'seqId': [60161], 'subSeqId': 0, 'altSeqId': [], 'dir': 'fprop', 'mod': ['Tensor'], 'op': ['clone'], 'tid': 1431533376, 'device': 0, 'stream': 7, 'grid': (37440, 1, 1), 'block': (128, 1, 1), 'kLongName': 'void at::native::elementwise_kernel<128, 4, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1} const&)::{lambda(int)#2}>(int, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1} const&)::{lambda(int)#2})'},
+		]
+
+		for kernel in kernels:
+			data = Data(kernel)
+			# The 'mod' and 'op' entries of the record are passed to foo with the Data object.
+			op_info = foo(kernel['mod'], kernel['op'], data)
+			data.setParams(op_info.params())
+
+
+def run_tests(test_name):
+	"""Run every test method of ``TestPyProfData`` with a text runner.
+
+	Args:
+		test_name: Label used only in the "Running tests for ..." message;
+			all test methods of the class are run regardless of its value.
+
+	Returns:
+		The ``unittest.TestResult`` produced by the runner, so callers can
+		check ``wasSuccessful()``.
+	"""
+	# Use the loader's own discovery (names starting with "test") instead of a
+	# substring match, which would also pick up helpers that merely contain
+	# "test_" in their name.
+	test_cases = unittest.TestLoader().getTestCaseNames(TestPyProfData)
+	print(f'Running tests for {test_name}')
+	suite = unittest.TestSuite()
+	for test_case in test_cases:
+		suite.addTest(TestPyProfData(test_case))
+	return unittest.TextTestRunner().run(suite)
+
+if __name__ == '__main__':
+	# Report failures through the process exit status instead of always
+	# exiting with 0.
+	result = run_tests('test_data')
+	raise SystemExit(0 if result.wasSuccessful() else 1)
--- a/tests/L0/run_test.py
+++ b/tests/L0/run_test.py
 import unittest
 import sys

-test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx"]
+test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"]

 runner = unittest.TextTestRunner(verbosity=2)