Commit fb075b86 authored by Michael Carilli
Browse files

Compilation succeeds on 0.4, 18.04-6 containers, and current upstream master

parent bf855389
#pragma once
#include <ATen/Half.h>
#include <ATen/cuda/CUDAHalf.cuh>
// Type traits to convert types to CUDA-specific types. Used primarily to
// convert at::Half to CUDA's half type. This makes the conversion explicit.
// Disambiguate from whatever is in aten
namespace apex { namespace cuda {

// Primary template: by default a host scalar type maps to itself on the device.
template <typename T>
struct TypeConversion
{
  typedef T type;
};

// at::Half is represented on the device by CUDA's native half type,
// so conversions through this trait are explicit rather than implicit.
template <>
struct TypeConversion<at::Half>
{
  typedef half type;
};

// Convenience alias template: apex::cuda::type<T> names the device-side
// counterpart of T. (Alias templates require `using`; no typedef form exists.)
template <typename T>
using type = typename TypeConversion<T>::type;

}} // namespace apex::cuda
#include <cuda.h>
#include <cuda_runtime.h>
// Lock in a local version of CUDATypeConversion.cuh
#include "CUDATypeConversion.cuh"
#include <THC/THCNumerics.cuh>
#if __CUDACC_VER_MAJOR__ >= 9
#define __SHFL_DOWN(var, delta) __shfl_down_sync(0xffffffff, var, delta)
#else
......@@ -13,19 +18,13 @@
#define __SYNCWARP
#endif
// not a long term solution, need to get this code into upstream.
#ifdef VERSION_LE_04
#define USING_ACCSCALAR_T using accscalar_t = cuda::acc_type<cuda_scalar_t>;
#else
#define USING_ACCSCALAR_T using accscalar_t = acc_type<cuda_scalar_t, true>;
#endif
#ifdef VERSION_LE_04
#define REDUCE_ADD ReduceAdd<accscalar_t, accscalar_t>()
#else
#define REDUCE_ADD ReduceAdd<accscalar_t>()
#endif
// Block size for weight_norm_*_first_dim_kernel.
// Currently, kernels are non-persistent.
// Dialing up the block size to, say 1024, can improve performance by
......@@ -44,13 +43,13 @@
// blocks across the slow dimension up to the hardware-max block size of 1024.
#define TILE_H 64
// For reference, in THCTensorMathReduce.cuh:
// template <typename T>
// struct ReduceAdd {
// inline __device__ T operator()(const T a, const T b) const {
// return THCNumerics<T>::add(a, b);
// }
// };
// Lock in a local version of ReduceAdd, copied from THCTensorMathReduce.cuh:
// Binary addition functor passed to reduce_block_into_lanes (replaces the
// REDUCE_ADD macro's expansion). Delegates to THCNumerics<T>::add rather than
// operator+ — NOTE(review): presumably so reductions work uniformly for THC
// scalar types such as half; matches the upstream struct quoted above.
template <typename T>
struct ReduceAdd {
  inline __device__ T operator()(const T a, const T b) const {
    return THCNumerics<T>::add(a, b);
  }
};
// lanes is intended to be <= 32.
template
......
......@@ -2,7 +2,8 @@
// #include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
// #include <THC/THCTensorMathReduce.cuh>
#include <THC/THCGeneral.h>
#include <assert.h>
......
......@@ -10,7 +10,7 @@
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
// #include <THC/THCTensorMathReduce.cuh>
template
<typename scalar_t,
......@@ -46,7 +46,7 @@ __global__ void weight_norm_bwd_first_dim_kernel
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, REDUCE_ADD);
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
accscalar_t result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
......@@ -105,7 +105,7 @@ __global__ void weight_norm_bwd_last_dim_kernel
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, REDUCE_ADD);
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
accscalar_t result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
......@@ -145,7 +145,7 @@ void weight_norm_bwd_cuda
{
#ifdef DEBUG_ANY
using namespace std;
cout << "Hello from send_to_bwd with pLpw.type = " << pLpw.type << endl;
cout << "Hello from send_to_bwd with pLpw.type() = " << pLpw.type() << endl;
#endif
const int ndims = savedv.ndimension();
......@@ -164,7 +164,7 @@ void weight_norm_bwd_cuda
"weight_norm_bwd_first_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
weight_norm_bwd_first_dim_kernel
......@@ -197,7 +197,7 @@ void weight_norm_bwd_cuda
"weight_norm_bwd_last_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
weight_norm_bwd_last_dim_kernel
......
......@@ -10,7 +10,7 @@
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
// #include <THC/THCTensorMathReduce.cuh>
template
<typename scalar_t,
......@@ -44,7 +44,7 @@ __global__ void weight_norm_fwd_first_dim_kernel
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, REDUCE_ADD);
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
accscalar_t result = s[0];
result = sqrtf(result);
......@@ -98,7 +98,7 @@ __global__ void weight_norm_fwd_last_dim_kernel
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, REDUCE_ADD);
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0)
......@@ -136,7 +136,7 @@ void weight_norm_fwd_cuda
{
#ifdef DEBUG_ANY
using namespace std;
cout << "hello from send_to_fwd with v.type = " << v.type << endl;
cout << "hello from send_to_fwd with v.type() = " << v.type() << endl;
#endif
const int ndims = v.ndimension();
......@@ -155,7 +155,7 @@ void weight_norm_fwd_cuda
"weight_norm_fwd_first_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
weight_norm_fwd_first_dim_kernel
......@@ -186,7 +186,7 @@ void weight_norm_fwd_cuda
"weight_norm_fwd_last_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
// just trying this formatting out to see how it feels...
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment