Commit d506eff2 authored by Michael Carilli

Macros based on torch.__version__ to compile with 0.4 and 0.5

parent 61b452e8
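In brief: setup.py inspects torch.__version__ and, when building against 0.4.x, passes -DVERSION_LE_04 to both the host and device compilers; the C++/CUDA sources then #ifdef on that define to select between the 0.4 and master (0.5) ATen spellings. A minimal sketch of the Python side, condensed from the setup.py hunk below (torch is the only assumed dependency):

```
import torch

# Parse major/minor from torch.__version__ (e.g. "0.4.0" or "0.5.0a0+abc123").
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])

# On 0.4.x, define VERSION_LE_04 so the C++/CUDA sources can pick the older
# ATen names (AT_ASSERT, cuda::acc_type, the two-argument ReduceAdd).
version_le_04 = []
if TORCH_MAJOR == 0 and TORCH_MINOR == 4:
    version_le_04 = ['-DVERSION_LE_04']

# The flag goes to both compilers, as in the setup.py hunk below:
#   extra_compile_args={'cxx': ['-g'] + version_le_04,
#                       'nvcc': ['-O3'] + version_le_04 + gencodes}
```

Gating on a build-time define rather than a runtime check keeps the compiled extension consistent with whichever PyTorch headers it was built against.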
@@ -5,8 +5,8 @@ This repo is designed to hold PyTorch modules and utilities that are under activ
 # Requirements
 Python 3
-PyTorch 0.3 or newer
 CUDA 9
+PyTorch 0.4 or newer. We recommend to use the latest stable release, obtainable from https://pytorch.org/. We also test against the latest master branch, obtainable from https://github.com/pytorch/pytorch. If you have any problems building, please file an issue.
 # [Full Documentation](https://nvidia.github.io/apex)
@@ -23,7 +23,7 @@ import apex
 ```
 and optionally (if required for your use)
 ```
-import apex._C as apex_backend
+import apex_C as apex_backend
 ```
 # What's included
...
@@ -4,7 +4,7 @@ import warnings
 import torch
-from apex._C import scale_check_overflow
+from apex_C import scale_check_overflow
 class AmpHandle(object):
 def __init__(self, enable_caching=True):
...
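scale_check_overflow is the backend routine the amp handle uses to downscale FP32 master gradients while testing for inf/nan. A hedged usage sketch; the in-place semantics and the nonzero-on-overflow buffer convention are assumptions, not spelled out in this diff:

```
import torch
import apex_C  # the renamed backend extension from this commit

# Hypothetical FP32 master grads and a one-element overflow buffer.
master_grads = torch.randn(1024).cuda()
overflow_buf = torch.cuda.IntTensor([0])

loss_scale = 128.0
# Downscale the master grads by 1/loss_scale, recording any inf/nan.
apex_C.scale_check_overflow(master_grads, 1.0 / loss_scale, overflow_buf)

# Assumed convention: a nonzero buffer means an overflow was detected,
# so the optimizer step should be skipped and the loss scale reduced.
if overflow_buf.item() != 0:
    print("overflow detected; skipping step")
```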
 import torch
 from torch.autograd import Variable
 from torch.autograd.function import Function, once_differentiable
-import apex._C
+import apex_C
 def check_contig_cuda(tensors, names):
 for tensor, name in zip(tensors, names):
@@ -71,7 +71,7 @@ class Fused_Weight_Norm(Function):
 [output_size(0),1,1,...].
 """
-apex._C.weight_norm_fwd(output, norms, input, g, dim)
+apex_C.weight_norm_fwd(output, norms, input, g, dim)
 ctx.save_for_backward(input, g)
 # save_for_backward can only save input or output tensors,
@@ -102,7 +102,7 @@ class Fused_Weight_Norm(Function):
 grad_input = grad_output_contig.new(grad_output.size()).contiguous()
 grad_g = savedg.new(savedg.size()).contiguous()
-apex._C.weight_norm_bwd(grad_input,
+apex_C.weight_norm_bwd(grad_input,
 grad_g,
 grad_output_contig,
 savedInput,
...
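Fused_Weight_Norm is a torch.autograd.Function whose forward and backward call into the renamed apex_C kernels. A rough usage sketch; the import path and the trailing dim argument to apply are assumptions inferred from the weight_norm_fwd call above:

```
import torch
# The import path is an assumption; the class lives in the file diffed above.
from apex.fused_weight_norm import Fused_Weight_Norm

v = torch.randn(64, 128, device="cuda", requires_grad=True)  # direction
g = torch.randn(64, 1, device="cuda", requires_grad=True)    # magnitude
dim = 0  # normalize over every dim except 0, as in torch.nn.utils.weight_norm

# forward  -> apex_C.weight_norm_fwd(output, norms, input, g, dim)
# backward -> apex_C.weight_norm_bwd(...) with the saved input and g
w = Fused_Weight_Norm.apply(v, g, dim)
w.sum().backward()
```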
@@ -4,7 +4,7 @@
 // here, but I can't make nvcc play well with torch.h. For now, use a layer of indirection
 // and separate .cu implementation files.
-// If we want everything to be part of "apex._C", we need all the interface functions defined
+// If we want everything to be part of "apex_C", we need all the interface functions defined
 // in this file, or linker will complain about "multiple definitions of PyInit".
 // TODO: multiple modules?
@@ -54,15 +54,23 @@ void scale_check_overflow_cuda
 float scale,
 const at::Tensor& d_buf);
+#ifdef VERSION_LE_04
+#define VERSION_AGNOSTIC_CHECK AT_ASSERT
+#else
+#define VERSION_AGNOSTIC_CHECK AT_CHECK
+#endif
 void scale_check_overflow
 (at::Tensor grads,
 float scale,
 at::Tensor overflow_buf)
 {
-AT_CHECK(grads.type().is_cuda(), "x must be a CUDA tensor");
-AT_CHECK(overflow_buf.type().is_cuda(), "y must be a CUDA tensor");
+VERSION_AGNOSTIC_CHECK
+(grads.type().is_cuda(), "x must be a CUDA tensor");
+VERSION_AGNOSTIC_CHECK
+(overflow_buf.type().is_cuda(), "y must be a CUDA tensor");
 // Make sure we are downscaling the FP32 master grads
-AT_CHECK
+VERSION_AGNOSTIC_CHECK
 (grads.type().scalarType() == at::ScalarType::Float,
 "grads supplied to scale_check_overflow should be fp32 (master grads).")
 scale_check_overflow_cuda(grads, scale, overflow_buf);
...
@@ -13,6 +13,19 @@
 #define __SYNCWARP
 #endif
+#ifdef VERSION_LE_04
+#define USING_ACCSCALAR_T using accscalar_t = cuda::acc_type<cuda_scalar_t>;
+#else
+#define USING_ACCSCALAR_T using accscalar_t = acc_type<cuda_scalar_t, true>;
+#endif
+#ifdef VERSION_LE_04
+#define REDUCE_ADD ReduceAdd<accscalar_t, accscalar_t>()
+#else
+#define REDUCE_ADD ReduceAdd<accscalar_t>()
+#endif
 // Block size for weight_norm_*_first_dim_kernel.
 // Currently, kernels are non-persistent.
 // Dialing up the block size to, say 1024, can improve performance by
...
 #include <ATen/ATen.h>
-#include "ATen/AccumulateType.h"
+// #include "ATen/AccumulateType.h"
 #include "ATen/cuda/CUDATensorMethods.cuh"
 #include "ATen/cuda/CUDATypeConversion.cuh"
 #include <THC/THCTensorMathReduce.cuh>
...
#include "kernel_utils.cuh" #include "kernel_utils.cuh"
#include <ATen/ATen.h> #include <ATen/ATen.h>
#ifdef VERSION_LE_04
#include "ATen/cuda/AccumulateType.cuh"
#else
#include "ATen/AccumulateType.h" #include "ATen/AccumulateType.h"
#endif
#include "ATen/cuda/CUDATensorMethods.cuh" #include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh" #include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh> #include <THC/THCTensorMathReduce.cuh>
...@@ -40,7 +46,7 @@ __global__ void weight_norm_bwd_first_dim_kernel ...@@ -40,7 +46,7 @@ __global__ void weight_norm_bwd_first_dim_kernel
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
} }
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>()); reduce_block_into_lanes(s, thread_sum, 1, REDUCE_ADD);
accscalar_t result = s[0]; accscalar_t result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably // Could choose to save reciprocal of norm instead I suppose, but norms is probably
...@@ -99,7 +105,7 @@ __global__ void weight_norm_bwd_last_dim_kernel ...@@ -99,7 +105,7 @@ __global__ void weight_norm_bwd_last_dim_kernel
slower_dims_location += blockDim.y; slower_dims_location += blockDim.y;
} }
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>()); reduce_block_into_lanes(s, thread_sum, blockDim.x, REDUCE_ADD);
accscalar_t result = s[threadIdx.x]; accscalar_t result = s[threadIdx.x];
// Broadcast load; could use shared memory instead. // Broadcast load; could use shared memory instead.
...@@ -159,7 +165,7 @@ void weight_norm_bwd_cuda ...@@ -159,7 +165,7 @@ void weight_norm_bwd_cuda
[&] [&]
{ {
using cuda_scalar_t = cuda::type<scalar_t>; using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>; USING_ACCSCALAR_T
weight_norm_bwd_first_dim_kernel weight_norm_bwd_first_dim_kernel
<<<pLpw.size(0), <<<pLpw.size(0),
...@@ -192,7 +198,7 @@ void weight_norm_bwd_cuda ...@@ -192,7 +198,7 @@ void weight_norm_bwd_cuda
[&] [&]
{ {
using cuda_scalar_t = cuda::type<scalar_t>; using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>; USING_ACCSCALAR_T
weight_norm_bwd_last_dim_kernel weight_norm_bwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W, <<<(fast_dim_size+TILE_W-1)/TILE_W,
......
#include "kernel_utils.cuh" #include "kernel_utils.cuh"
#include <ATen/ATen.h> #include <ATen/ATen.h>
#ifdef VERSION_LE_04
#include "ATen/cuda/AccumulateType.cuh"
#else
#include "ATen/AccumulateType.h" #include "ATen/AccumulateType.h"
#endif
#include "ATen/cuda/CUDATensorMethods.cuh" #include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh" #include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh> #include <THC/THCTensorMathReduce.cuh>
...@@ -38,7 +44,7 @@ __global__ void weight_norm_fwd_first_dim_kernel ...@@ -38,7 +44,7 @@ __global__ void weight_norm_fwd_first_dim_kernel
thread_sum += val_f*val_f; // AccumOp, could do Kahan here thread_sum += val_f*val_f; // AccumOp, could do Kahan here
} }
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>()); reduce_block_into_lanes(s, thread_sum, 1, REDUCE_ADD);
accscalar_t result = s[0]; accscalar_t result = s[0];
result = sqrtf(result); result = sqrtf(result);
...@@ -92,7 +98,7 @@ __global__ void weight_norm_fwd_last_dim_kernel ...@@ -92,7 +98,7 @@ __global__ void weight_norm_fwd_last_dim_kernel
slower_dims_location += blockDim.y; slower_dims_location += blockDim.y;
} }
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>()); reduce_block_into_lanes(s, thread_sum, blockDim.x, REDUCE_ADD);
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later // Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0) if(threadIdx.y == 0)
...@@ -150,7 +156,7 @@ void weight_norm_fwd_cuda ...@@ -150,7 +156,7 @@ void weight_norm_fwd_cuda
[&] [&]
{ {
using cuda_scalar_t = cuda::type<scalar_t>; using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>; USING_ACCSCALAR_T
weight_norm_fwd_first_dim_kernel weight_norm_fwd_first_dim_kernel
<<<v.size(0), <<<v.size(0),
...@@ -181,7 +187,7 @@ void weight_norm_fwd_cuda ...@@ -181,7 +187,7 @@ void weight_norm_fwd_cuda
[&] [&]
{ {
using cuda_scalar_t = cuda::type<scalar_t>; using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>; USING_ACCSCALAR_T
// just trying this formatting out to see how it feels... // just trying this formatting out to see how it feels...
weight_norm_fwd_last_dim_kernel weight_norm_fwd_last_dim_kernel
......
@@ -14,6 +14,18 @@ if not torch.cuda.is_available():
 print("Warning: Torch did not find available GPUs on this system.\n",
 "If your intention is to cross-compile, this is not an error.")
+print("torch.__version__ = ", torch.__version__)
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
+    raise RuntimeError("APEx requires Pytorch 0.4 or newer.\n" +
+                       "The latest stable release can be obtained from https://pytorch.org/")
+version_le_04 = []
+if TORCH_MAJOR == 0 and TORCH_MINOR == 4:
+    version_le_04 = ['-DVERSION_LE_04']
 def find(path, regex_func, collect=False):
 collection = [] if collect else None
 for root, dirs, files in os.walk(path):
@@ -35,37 +47,37 @@ def get_cuda_version():
 CUDA_LIB = re.compile(', V[0-9]+\.[0-9]+\.[0-9]+').search(nvcc_output).group(0).split('V')[1]
 print("Found CUDA_LIB = ", CUDA_LIB)
-CUDA_MAJOR_VERSION = int(CUDA_LIB.split('.')[0])
-print("Found CUDA_MAJOR_VERSION = ", CUDA_MAJOR_VERSION)
-if CUDA_MAJOR_VERSION < 8:
+CUDA_MAJOR = int(CUDA_LIB.split('.')[0])
+print("Found CUDA_MAJOR = ", CUDA_MAJOR)
+if CUDA_MAJOR < 8:
 raise RuntimeError("APex requires CUDA 8.0 or newer")
-return CUDA_MAJOR_VERSION
+return CUDA_MAJOR
 if CUDA_HOME is not None:
 print("Found CUDA_HOME = ", CUDA_HOME)
-CUDA_MAJOR_VERSION = get_cuda_version()
+CUDA_MAJOR = get_cuda_version()
 gencodes = ['-gencode', 'arch=compute_52,code=sm_52',
 '-gencode', 'arch=compute_60,code=sm_60',
 '-gencode', 'arch=compute_61,code=sm_61',]
-if CUDA_MAJOR_VERSION > 8:
+if CUDA_MAJOR > 8:
 gencodes += ['-gencode', 'arch=compute_70,code=sm_70',
 '-gencode', 'arch=compute_70,code=compute_70',]
 ext_modules = []
 extension = CUDAExtension(
-'apex._C', [
+'apex_C', [
 'csrc/interface.cpp',
 'csrc/weight_norm_fwd_cuda.cu',
 'csrc/weight_norm_bwd_cuda.cu',
 'csrc/scale_cuda.cu',
 ],
-extra_compile_args={'cxx': ['-g'],
-'nvcc': ['-O3'] + gencodes})
+extra_compile_args={'cxx': ['-g'] + version_le_04,
+'nvcc': ['-O3'] + version_le_04 + gencodes})
 ext_modules.append(extension)
 else:
 raise RuntimeError("Could not find Cuda install directory")
...
@@ -3,9 +3,9 @@ import numpy as np
 def compare(cuda_out, pt_out, pt_out_control, rows):
-print( "Pytorch ops in fp16: ", pt_out )
-print( "Kernel result: ", cuda_out )
-print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)
+# print( "Pytorch ops in fp16: ", pt_out )
+# print( "Kernel result: ", cuda_out )
+# print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)
 # Make upconverted copies for error check against fp32 control
 cuda_out_fp32 = cuda_out.float()
@@ -22,21 +22,21 @@ def compare(cuda_out, pt_out, pt_out_control, rows):
 pt_maxdiffs, pt_maxdiff_locs = torch.max((pt_out_control - pt_out_fp32 ).abs(),1)
 print( "cuda_maxdiffs = ", cuda_maxdiffs )
-print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
+# print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
 print( "pt_maxdiffs = ", pt_maxdiffs )
-print( "pt_maxdiff_locs = ", pt_maxdiff_locs )
+# print( "pt_maxdiff_locs = ", pt_maxdiff_locs )
 row_indices = torch.LongTensor(np.arange(rows))
-print("cuda_out at cuda_maxdiff_locs in each row:")
-# bizarrely, this will work if you do it at the python prompt:
-# print(cuda_out[row_indices,cuda_maxdiff_locs])
-# ...but it only seems to work here if you wrap with numpy arrays:
-print( cuda_out[np.array(row_indices),np.array(cuda_maxdiff_locs)])
-print("pt_out_control at cuda_maxdiff_locs in each row:")
-print(pt_out_control[np.array(row_indices),np.array(cuda_maxdiff_locs)])
-
-print("pt_out at pt_maxdiff_locs in each row:" )
-print( pt_out[np.array(row_indices),np.array(pt_maxdiff_locs)])
-print("pt_out_control at pt_maxdiff_locs in each row:" )
-print(pt_out_control[np.array(row_indices),np.array(pt_maxdiff_locs)])
+# print("cuda_out at cuda_maxdiff_locs in each row:")
+# # bizarrely, this will work if you do it at the python prompt:
+# # print(cuda_out[row_indices,cuda_maxdiff_locs])
+# # ...but it only seems to work here if you wrap with numpy arrays:
+# print( cuda_out[np.array(row_indices),np.array(cuda_maxdiff_locs)])
+# print("pt_out_control at cuda_maxdiff_locs in each row:")
+# print(pt_out_control[np.array(row_indices),np.array(cuda_maxdiff_locs)])
+#
+# print("pt_out at pt_maxdiff_locs in each row:" )
+# print( pt_out[np.array(row_indices),np.array(pt_maxdiff_locs)])
+# print("pt_out_control at pt_maxdiff_locs in each row:" )
+# print(pt_out_control[np.array(row_indices),np.array(pt_maxdiff_locs)])
 import torch
 from torch.autograd import Variable
-import apex._C
+import apex_C
 import numpy as np
 from compare import compare
 from norm import pt_norm, get_norm_shape
@@ -88,7 +88,7 @@ for rows, cols, fast in sizes:
 pLpg_cuda = pLpg_cuda .half()
 torch.cuda.nvtx.range_push("kernel weight norm backward")
-apex._C.weight_norm_bwd(pLpInput_cuda,
+apex_C.weight_norm_bwd(pLpInput_cuda,
 pLpg_cuda,
 pLpOutput_fp16,
 pt_input_fp16,
...
 import torch
 import sys
-import apex._C
+import apex_C
 import numpy as np
 from compare import compare
 from norm import pt_norm, get_norm_shape
@@ -60,7 +60,7 @@ for rows, cols, fast in sizes:
 g = g.half()
 cuda_out = cuda_out.half()
-apex._C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
+apex_C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
 torch.cuda.synchronize()
 # quit()
...
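The tests above drive the renamed backend directly and compare against plain PyTorch ops. A condensed sketch of the forward-test pattern, with assumed shapes (the fused kernel writes the normalized output and the per-row norms into preallocated tensors):

```
import torch
import apex_C

rows, cols, dim = 1024, 512, 0
pt_in = torch.randn(rows, cols, device="cuda")
g = torch.randn(rows, 1, device="cuda")

# Preallocated outputs; the (rows, 1) norm shape is an assumption for dim=0.
cuda_out = torch.empty_like(pt_in)
cuda_norms = torch.empty(rows, 1, device="cuda")

apex_C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
torch.cuda.synchronize()

# fp32 PyTorch reference, as in compare.py's control path.
control = g * pt_in / pt_in.norm(2, 1, keepdim=True)
print("max abs diff:", (cuda_out - control).abs().max())
```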