Commit d695b68b authored by Michael Carilli

Switch to simple Python-only install, in preparation for upstreaming C++ backend.

parent aa817132
import torch
from apex_C import scale_check_overflow
# from apex_C import scale_check_overflow
# Python stopgap, until we get a future-proof kernel into upstream
def scale_check_overflow(d_grads, scale):
# Exception handling for 18.04 compatibility
try:
cpu_sum = float(d_grads.float().sum())
except RuntimeError as instance:
if "value cannot be converted" not in instance.args[0]:
raise
return True
else:
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
return True
d_grads.mul_(scale)
return False
class LossScaler(object):
def __init__(self):
@@ -8,20 +23,23 @@ class LossScaler(object):
self._max_loss_scale = 2.**24
self._scale_seq_len = 2000
self._unskipped = 0
self._overflow_buf = torch.cuda.ByteTensor(1024,)
self._has_overflow = False
# self._overflow_buf = torch.cuda.ByteTensor(1024,)
def loss_scale(self):
return self._loss_scale
def unscale_and_update(self, param_groups, scale):
self._overflow_buf.zero_()
# self._overflow_buf.zero_()
self._has_overflow = False
for p in iter_params(param_groups):
if p.grad is not None:
scale_check_overflow(p.grad.data,
1. / scale,
self._overflow_buf)
self._has_overflow = scale_check_overflow(p.grad.data,
1. / scale)
if self._has_overflow: break
if self._overflow_buf.any():
# if self._overflow_buf.any():
if self._has_overflow:
should_skip = True
self._loss_scale /= 2.
self._unskipped = 0
......
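For reference, the Python stopgap above can be exercised on its own roughly as follows. This is a minimal sketch (the variable names are illustrative, not part of the diff), assuming the scale_check_overflow defined above is in scope:

import torch

# Hypothetical fp32 master gradient and a dynamic loss scale of 128.
grad = torch.randn(1024)
loss_scale = 128.0
# Returns True on overflow; otherwise multiplies grad in place by 1/loss_scale.
overflow = scale_check_overflow(grad, 1. / loss_scale)
if overflow:
    # LossScaler would skip the optimizer step and halve the loss scale here.
    pass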
import torch
from torch.autograd import Variable
from torch.autograd.function import Function, once_differentiable
import apex_C
def check_contig_cuda(tensors, names):
for tensor, name in zip(tensors, names):
if not tensor.is_contiguous():
raise RuntimeError(name+" with size {} is not contiguous"
.format(tensor.size()))
if not tensor.is_cuda:
raise RuntimeError(name+".is_cuda = False."
"Currently, only cuda tensors are supported.")
class Fused_Weight_Norm(Function):
"""
Custom autograd function that implements weight norm, as presented in
`<https://arxiv.org/abs/1602.07868>`_,
along a tensor's slowest or
fastest dimension using fused kernel launches for the forward and backward passes.
Accepts fp32 or fp16 input; the output type will match the input type.
Within the kernels, all calculations are performed in fp32 for numerical stability, regardless
of input/output precision.
We are refactoring our fused kernels to add to Pytorch core, so that Pytorch's built-in weightnorm
will use them transparently. Please use Pytorch's built-in weightnorm implementation for now, to
future-proof your code.
"""
@staticmethod
def forward(ctx, input, g, dim=0):
"""
Args:
input(torch.cuda.FloatTensor or torch.cuda.HalfTensor): input tensor corresponding to **v** in the paper. ``input`` should be contiguous.
g(torch.cuda.FloatTensor or torch.cuda.HalfTensor): input tensor corresponding to **g** in the paper. ``g`` should be the same type as ``input``.
dim(int, optional, default=0): Dimension across which to perform weightnorm. Currently, only the first or last dimension of the input tensor is supported.
Returns:
Output tensor corresponding to **w** in the paper. Output type and precision will match
type and precision of ``input``.
"""
# torch.cuda.nvtx.range_push("FusedNorm.forward, input.size() = {}"
# .format(input.size()))
check_contig_cuda((input,g),("input","g"))
"""
This is ok, new() treats a torch.Size object properly.
No need to unpack with an asterisk via new(*input.size()).
"""
output = input.new(input.size()).contiguous()
"""
For output with size (slow, faster, faster, ...fastest), we want
norms with size (slow, 1, 1, ...1), so that if you want to retrieve norms
and apply the same normalizing factors to another Tensor "t" with the
same size as output, "t/norms" will broadcast each element of norms
across the corresponding slowest dim of t.
"""
if dim == 0:
norm_size = (output.size(0),) + (1,)*(output.dim() - 1)
elif dim == output.dim() - 1:
norm_size = (1,)*(output.dim() - 1) + (output.size(-1),)
else:
raise RuntimeError("Currently, Fused_Weight_Norm only supports first or last dimension.")
norms = torch.cuda.FloatTensor(*norm_size).contiguous()
"""
Beware: If you call the following:
norms = torch.cuda.FloatTensor(norm_size).contiguous()
the constructor sees a tuple:
FloatTensor( (output.size(0),1,1,...) )
and creates a 1D tensor with values from the tuple:
[output.size(0),1,1,...].
"""
apex_C.weight_norm_fwd(output, norms, input, g, dim)
ctx.save_for_backward(input, g)
# save_for_backward can only save input or output tensors,
# use ctx state to save the norms and dimension:
ctx.norms = norms
ctx.dim = dim
return output
def forward(ctx):
raise NotImplementedError("Use Pytorch's built-in weightnorm implementation. "+
"We are in the process of adding our fused kernels to Pytorch core, "+
"so Pytorch's built-in weightnorm will use them transparently.")
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
"""
Args:
grad_output(torch.cuda.FloatTensor or torch.cuda.HalfTensor): Gradient of loss with respect to output **w**. ``grad_output`` should be contiguous for performance.
Returns:
Gradient of loss with respect to ``input`` and ``g``. The precision of these gradients will match the precision of ``grad_output``.
"""
check_contig_cuda((grad_output,), ("grad_output",))
savedInput, savedg = ctx.saved_tensors
savedNorms = ctx.norms
# We expect that these .contiguous() calls will be no-ops. They're present for safety.
grad_output_contig = grad_output.contiguous()
grad_input = grad_output_contig.new(grad_output.size()).contiguous()
grad_g = savedg.new(savedg.size()).contiguous()
apex_C.weight_norm_bwd(grad_input,
grad_g,
grad_output_contig,
savedInput,
savedg,
savedNorms,
ctx.dim)
return grad_input, grad_g, None
raise NotImplementedError("Use Pytorch's built-in weightnorm implementation. "+
"We are in the process of adding our fused kernels to Pytorch core, "+
"so Pytorch's built-in weightnorm will use them transparently.")
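For orientation, a minimal sketch of how the custom function above is invoked, mirroring the test scripts later in this commit. The sizes are illustrative, and it assumes the fused apex_C backend (pre-this-commit) is still built:

import torch
from torch.autograd import Variable
from apex.fp16_utils import Fused_Weight_Norm   # import path used by the tests below

v = Variable(torch.cuda.FloatTensor(64, 256).normal_(), requires_grad=True)
g = Variable(torch.cuda.FloatTensor(64, 1).normal_(), requires_grad=True)
# dim=0: normalize each slowest-dim row of v, then scale by g.
w = Fused_Weight_Norm.apply(v, g, 0)
w.sum().backward()   # populates v.grad and g.grad via the fused backward kernel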
#pragma once
#include <ATen/Half.h>
#include <ATen/cuda/CUDAHalf.cuh>
// Type traits to convert types to CUDA-specific types. Used primarily to
// convert at::Half to CUDA's half type. This makes the conversion explicit.
// Disambiguate from whatever is in aten
namespace apex { namespace cuda {
template <typename T>
struct TypeConversion {
using type = T;
};
template <>
struct TypeConversion<at::Half> {
using type = half;
};
template <typename T>
using type = typename TypeConversion<T>::type;
}} // namespace apex::cuda
#include <torch/torch.h>
// Ideally, I'd like to call this file "weight_norm.cu" and put the interface and the implementation
// here, but I can't make nvcc play well with torch.h. For now, use a layer of indirection
// and separate .cu implementation files.
// If we want everything to be part of "apex_C", we need all the interface functions defined
// in this file, or the linker will complain about "multiple definitions of PyInit".
// TODO: multiple modules?
// TODO: modify fwd+bwd calls to return a tuple of Tensors. This will require changing the
// Python client code as well. For now, get things working with the same Python-side API.
void weight_norm_fwd_cuda
(const at::Tensor& w,
const at::Tensor& norms,
const at::Tensor& v,
const at::Tensor& g,
int dim);
void weight_norm_fwd
(at::Tensor w,
at::Tensor norms,
at::Tensor v,
at::Tensor g,
int dim)
{
weight_norm_fwd_cuda(w, norms, v, g, dim);
}
void weight_norm_bwd_cuda
(const at::Tensor& pLpv,
const at::Tensor& pLpg,
const at::Tensor& pLpw,
const at::Tensor& savedv,
const at::Tensor& savedg,
const at::Tensor& savedNorms,
int dim);
void weight_norm_bwd
(at::Tensor pLpv,
at::Tensor pLpg,
at::Tensor pLpw,
at::Tensor savedv,
at::Tensor savedg,
at::Tensor savedNorms,
int dim)
{
weight_norm_bwd_cuda(pLpv, pLpg, pLpw, savedv, savedg, savedNorms, dim);
}
void scale_check_overflow_cuda
(const at::Tensor& d_grads,
float scale,
const at::Tensor& d_buf);
#ifdef VERSION_LE_04
#define VERSION_AGNOSTIC_CHECK AT_ASSERT
#else
#define VERSION_AGNOSTIC_CHECK AT_CHECK
#endif
void scale_check_overflow
(at::Tensor grads,
float scale,
at::Tensor overflow_buf)
{
VERSION_AGNOSTIC_CHECK
(grads.type().is_cuda(), "grads must be a CUDA tensor");
VERSION_AGNOSTIC_CHECK
(overflow_buf.type().is_cuda(), "overflow_buf must be a CUDA tensor");
// Make sure we are downscaling the FP32 master grads
VERSION_AGNOSTIC_CHECK
(grads.type().scalarType() == at::ScalarType::Float,
"grads supplied to scale_check_overflow should be fp32 (master grads).")
scale_check_overflow_cuda(grads, scale, overflow_buf);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("weight_norm_fwd", &weight_norm_fwd, "Fused weight norm, forward pass");
m.def("weight_norm_bwd", &weight_norm_bwd, "Fused weight norm, backward pass");
m.def("scale_check_overflow", &scale_check_overflow, "Fused overflow check + scale for FP32 tensors");
}
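For context, the scale_check_overflow binding above was driven from Python by the old LossScaler code roughly as follows. A sketch only, assuming the apex_C extension is built:

import torch
import apex_C

grads = torch.cuda.FloatTensor(1 << 20).normal_()    # fp32 master gradients
overflow_buf = torch.cuda.ByteTensor(1024).zero_()   # one flag slot per launched block
apex_C.scale_check_overflow(grads, 1. / 128.0, overflow_buf)
if overflow_buf.any():
    # At least one gradient element was Inf/NaN; skip this optimizer step.
    pass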
#include <cuda.h>
#include <cuda_runtime.h>
// Lock in a local version of CUDATypeConversion.cuh
#include "CUDATypeConversion.cuh"
#include <THC/THCNumerics.cuh>
#if __CUDACC_VER_MAJOR__ >= 9
#define __SHFL_DOWN(var, delta) __shfl_down_sync(0xffffffff, var, delta)
#else
#define __SHFL_DOWN(var, delta) __shfl_down(var, delta)
#endif
#if __CUDACC_VER_MAJOR__ >= 9
#define __SYNCWARP __syncwarp()
#else
#define __SYNCWARP
#endif
// Not a long-term solution; this code needs to go upstream.
#ifdef VERSION_LE_04
#define USING_ACCSCALAR_T using accscalar_t = cuda::acc_type<cuda_scalar_t>;
#else
#define USING_ACCSCALAR_T using accscalar_t = acc_type<cuda_scalar_t, true>;
#endif
// Block size for weight_norm_*_first_dim_kernel.
// Currently, kernels are non-persistent.
// Dialing up the block size to, say, 1024 can improve performance by
// increasing the amount of cache available per block, which can improve the cache hit rate.
// However, this is less efficient for short rows. 256 is pretty versatile.
// May be worth implementing heuristics later.
#define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel.
// This is trickier than the first_dim case because we must make blocks
// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses.
// Since output-element parallelism is along the fast dimension, this reduces the number of
// blocks we can launch by 16X.
#define TILE_W 16
// Somewhat versatile strategy: max out intra-block parallelism by extending
// blocks across the slow dimension up to the hardware-max block size of 1024.
#define TILE_H 64
// Lock in a local version of ReduceAdd, copied from THCTensorMathReduce.cuh:
template <typename T>
struct ReduceAdd {
inline __device__ T operator()(const T a, const T b) const {
return THCNumerics<T>::add(a, b);
}
};
// lanes is intended to be <= 32.
template
<typename T,
typename ReduceOp>
__device__ __forceinline__ void reduce_block_into_lanes
(T *x,
T val,
int lanes,
ReduceOp reduceOp)
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y;
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] = reduceOp(x[tid], x[tid+i]);
__syncthreads();
}
if(tid < 32)
{
T final;
if(blockSize >= 64)
final = reduceOp(x[tid], x[tid+32]);
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final = reduceOp(final, __SHFL_DOWN(final, i));
if(tid < lanes)
x[tid] = final; // EpilogueOp
}
// Make sure the smem result is visible to all warps.
__syncthreads();
}
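Stripped of the CUDA mechanics, reduce_block_into_lanes leaves x[0..lanes-1] holding per-lane partial sums of every thread's val: lane j accumulates the vals of all threads whose tid is congruent to j modulo lanes. A rough NumPy sketch of that contract, assuming lanes is a power of two <= 32 and the block size is a multiple of lanes:

import numpy as np

def reduce_block_into_lanes_reference(vals, lanes):
    # vals: one value per thread, ordered by tid = threadIdx.x + threadIdx.y*blockDim.x.
    vals = np.asarray(vals, dtype=np.float32)
    # Column j collects vals[j], vals[j+lanes], vals[j+2*lanes], ...
    return vals.reshape(-1, lanes).sum(axis=0)

# lanes=1 reproduces the full block sum used by the first-dim kernels;
# lanes=blockDim.x gives one sum per fast-dim column, as in the last-dim kernels.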
#include <ATen/ATen.h>
// #include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDATensorMethods.cuh"
// #include "ATen/cuda/CUDATypeConversion.cuh"
// #include <THC/THCTensorMathReduce.cuh>
#include <THC/THCGeneral.h>
#include <assert.h>
#define BLOCK_SIZE 1024
#define MAX_BLOCKS 1024
// It makes sense to lock the type to "float" here because the downscaling
// should only be applied to the FP32 master gradients. Also, if "in" were
// a different type, it would require divergent code for the vectorized load logic.
// TODO:
// Update overflow check to use reduction from kernel_utils.cuh with
// ReduceOp from THCTensorMathReduce.cuh.
__global__ void scale_reduce_overflow
(float *in,
size_t n,
float scale,
uint8_t *overflow_out)
{
__shared__ uint8_t cta_overflow[BLOCK_SIZE];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = gridDim.x * blockDim.x;
uint8_t my_overflow = 0;
for (int i = tid * 4; i < n; i+= stride * 4) {
if (i < (n - 3)) {
float4 f4 = ((float4*)in)[i / 4];
if (isfinite(f4.x)) {
f4.x *= scale;
} else {
my_overflow = 1;
}
if (isfinite(f4.y)) {
f4.y *= scale;
} else {
my_overflow = 1;
}
if (isfinite(f4.z)) {
f4.z *= scale;
} else {
my_overflow = 1;
}
if (isfinite(f4.w)) {
f4.w *= scale;
} else {
my_overflow = 1;
}
((float4*)in)[i / 4] = f4;
} else {
for (; i < n; ++i) {
if (isfinite(in[i])) {
in[i] *= scale;
} else {
my_overflow = 1;
}
}
}
}
int tIdx = threadIdx.x;
cta_overflow[tIdx] = my_overflow;
__syncthreads();
int participating = BLOCK_SIZE / 2;
while (participating > 0) {
if (tIdx < participating) {
cta_overflow[tIdx] = max(cta_overflow[tIdx],
cta_overflow[tIdx + participating]);
}
participating /= 2;
__syncthreads();
}
if (tIdx == 0) {
overflow_out[blockIdx.x] = max(cta_overflow[0],
overflow_out[blockIdx.x]);
}
}
void scale_check_overflow_cuda
(const at::Tensor& d_grads,
float scale,
const at::Tensor& d_buf)
{
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
size_t n = d_grads.numel();
size_t buf_n = d_buf.numel();
int num_blks = min((int(n) + BLOCK_SIZE - 1) / BLOCK_SIZE,
MAX_BLOCKS);
assert(buf_n >= num_blks);
scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>
(d_grads.data<float>(),
n,
scale,
d_buf.data<uint8_t>());
THCudaCheck(cudaGetLastError());
}
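Functionally, the launch above scales every finite element of d_grads in place and records, per block, whether any non-finite element was seen. A rough PyTorch sketch of that contract (not the actual code path; finiteness is checked manually so the sketch also works on the 0.4-era API):

import torch

def scale_check_overflow_reference(grads, scale, overflow_buf):
    # grads: fp32 tensor, scaled in place; overflow_buf: uint8 buffer flagged on overflow.
    finite = (grads == grads) & (grads.abs() != float('inf'))
    grads[finite] *= scale
    if not bool(finite.all()):
        # The kernel sets one flag per block; this is the coarse equivalent.
        overflow_buf.fill_(1)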
#include "kernel_utils.cuh"
#include <ATen/ATen.h>
#ifdef VERSION_LE_04
#include "ATen/cuda/AccumulateType.cuh"
#else
#include "ATen/AccumulateType.h"
#endif
#include "ATen/cuda/CUDATensorMethods.cuh"
// #include "ATen/cuda/CUDATypeConversion.cuh"
// #include <THC/THCTensorMathReduce.cuh>
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_bwd_first_dim_kernel
(scalar_t* __restrict__ pLpv,
scalar_t* __restrict__ pLpg,
const scalar_t* __restrict__ pLpw,
const scalar_t* __restrict__ savedv,
const scalar_t* __restrict__ savedg,
const accscalar_t* __restrict__ savedNorms,
const int rowSize)
{
// For now, assign one block to each row.
const int tid = threadIdx.x;
const int row = blockIdx.x;
const int stride = blockDim.x;
// Logical index offset for this flattened row
const int rowStart = row*rowSize;
// Hack to get around nvcc complaining when an smem array is declared with the same name
// but different types in different kernels (in this case different instantiations)
// extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s"
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t pLpwi = scalar_cast<accscalar_t>(pLpw[i+rowStart]);
accscalar_t savedvi = scalar_cast<accscalar_t>(savedv[i+rowStart]);
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
accscalar_t result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
// more handy to keep around.
// Broadcast load; could use shared memory instead.
accscalar_t rnorm = 1.f/savedNorms[row];
accscalar_t rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(tid == 0)
pLpg[row] = scalar_cast<scalar_t>(result*rnorm);
// Broadcast load, could use shared memory instead.
accscalar_t g_this_row = scalar_cast<accscalar_t>(savedg[row]);
// Write v gradients. We are reusing values that were loaded earlier, so there
// is an optimization opportunity here (store values persistently).
for(int j = tid; j < rowSize; j += stride )
{
accscalar_t pLpwj = scalar_cast<accscalar_t>(pLpw[j+rowStart]);
accscalar_t savedvj = scalar_cast<accscalar_t>(savedv[j+rowStart]);
accscalar_t pLpvj = g_this_row*(rnorm*pLpwj - rnorm3*savedvj*result);
pLpv[j+rowStart] = scalar_cast<scalar_t>(pLpvj);
}
}
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_bwd_last_dim_kernel
(scalar_t* __restrict__ pLpv,
scalar_t* __restrict__ pLpg,
const scalar_t* __restrict__ pLpw,
const scalar_t* __restrict__ savedv,
const scalar_t* __restrict__ savedg,
const accscalar_t* __restrict__ savedNorms,
const int fast_dim_size,
const int slower_dims_size)
{
const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
int slower_dims_location = threadIdx.y;
int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t pLpwi = scalar_cast<accscalar_t>(pLpw[currentIdx]);
accscalar_t savedvi = scalar_cast<accscalar_t>(savedv[currentIdx]);
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
accscalar_t result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
accscalar_t rnorm = 1.f/savedNorms[fast_dim_location];
accscalar_t rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(threadIdx.y == 0)
pLpg[fast_dim_location] = scalar_cast<scalar_t>(result*rnorm);
// Entire block pulls these values, could use shared memory instead.
accscalar_t g_this_col = scalar_cast<accscalar_t>(savedg[fast_dim_location]);
// Write v gradients.
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t pLpwj = scalar_cast<accscalar_t>(pLpw[currentIdx]);
accscalar_t savedvj = scalar_cast<accscalar_t>(savedv[currentIdx]);
accscalar_t pLpvj = g_this_col*(rnorm*pLpwj - rnorm3*savedvj*result);
pLpv[currentIdx] = scalar_cast<scalar_t>(pLpvj);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
void weight_norm_bwd_cuda
(const at::Tensor& pLpv,
const at::Tensor& pLpg,
const at::Tensor& pLpw,
const at::Tensor& savedv,
const at::Tensor& savedg,
const at::Tensor& savedNorms,
int dim)
{
#ifdef DEBUG_ANY
using namespace std;
cout << "Hello from send_to_bwd with pLpw.type() = " << pLpw.type() << endl;
#endif
const int ndims = savedv.ndimension();
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
int rowSize = 1;
for(int i = ndims - 1; i > 0; i--)
rowSize *= savedv.size(i);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(savedv.type(),
"weight_norm_bwd_first_dim_kernel",
[&]
{
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
weight_norm_bwd_first_dim_kernel
<<<pLpw.size(0),
BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
(pLpv.data<cuda_scalar_t>(),
pLpg.data<cuda_scalar_t>(),
pLpw.data<cuda_scalar_t>(),
savedv.data<cuda_scalar_t>(),
savedg.data<cuda_scalar_t>(),
savedNorms.data<accscalar_t>(),
rowSize);
});
}
else if(dim == ndims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
int slower_dims_size = 1;
for(int i = 0; i < ndims - 1; i++)
slower_dims_size *= savedv.size(i);
int fast_dim_size = savedv.size(ndims-1);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(savedv.type(),
"weight_norm_bwd_last_dim_kernel",
[&]
{
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
weight_norm_bwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t),
stream>>>
(pLpv.data<cuda_scalar_t>(),
pLpg.data<cuda_scalar_t>(),
pLpw.data<cuda_scalar_t>(),
savedv.data<cuda_scalar_t>(),
savedg.data<cuda_scalar_t>(),
savedNorms.data<accscalar_t>(),
fast_dim_size,
slower_dims_size);
});
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_bwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
// The kernel execution is asynchronous, so this will only catch errors on the kernel launch,
// not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught
// until a later error check on a synchronizing CUDA call. Unfortunately, without manually
// synchronizing here, this is the best we can do.
THCudaCheck(cudaGetLastError());
#ifdef DEBUG_PROFILE
THCudaCheck(cudaDeviceSynchronize());
#endif
}
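For checking, the gradient math implemented by these kernels for the dim == 0 case can be written densely in PyTorch as follows. A reference sketch in fp32; the kernels additionally cast to and from the input precision:

import torch

def weight_norm_bwd_first_dim_reference(pLpw, savedv, savedg, savedNorms):
    rows = pLpw.size(0)
    pLpw2d = pLpw.contiguous().view(rows, -1).float()
    v2d    = savedv.contiguous().view(rows, -1).float()
    norms  = savedNorms.view(rows, 1).float()
    g      = savedg.view(rows, 1).float()
    dot  = (pLpw2d * v2d).sum(dim=1, keepdim=True)          # per-row sum(pLpw * v)
    pLpg = (dot / norms).view_as(savedg)                     # matches result*rnorm
    pLpv = g * (pLpw2d / norms - v2d * dot / norms.pow(3))   # matches g*(rnorm*pLpw - rnorm3*v*result)
    return pLpv.view_as(savedv), pLpg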
#include "kernel_utils.cuh"
#include <ATen/ATen.h>
#ifdef VERSION_LE_04
#include "ATen/cuda/AccumulateType.cuh"
#else
#include "ATen/AccumulateType.h"
#endif
#include "ATen/cuda/CUDATensorMethods.cuh"
// #include "ATen/cuda/CUDATypeConversion.cuh"
// #include <THC/THCTensorMathReduce.cuh>
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_fwd_first_dim_kernel
(scalar_t* __restrict__ w,
accscalar_t* __restrict__ norms,
const scalar_t* __restrict__ v,
const scalar_t* __restrict__ g,
const int rowSize)
{
// We are norming each slowest-dim row of the tensor separately.
// For now, assign one block to each row.
const int tid = threadIdx.x;
const int row = blockIdx.x;
const int stride = blockDim.x;
// Logical index offset for this flattened row
const int rowStart = row*rowSize;
// Hack to get around nvcc complaining when an smem array is declared with the same name
// but different types in different kernels (in this case different instantiations)
// extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s"
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
accscalar_t result = s[0];
result = sqrtf(result);
if(tid == 0)
norms[row] = result;
// Broadcast load, could use shared memory instead.
accscalar_t g_this_row = scalar_cast<accscalar_t>(g[row]);
accscalar_t rnorm = 1.f/result; // for consistency with backward kernel
// Write data to output
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
w[i+rowStart] = scalar_cast<scalar_t>(g_this_row*val_f*rnorm);
}
}
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_fwd_last_dim_kernel
(
scalar_t* __restrict__ w,
accscalar_t* __restrict__ norms,
const scalar_t* __restrict__ v,
const scalar_t* __restrict__ g,
const int fast_dim_size,
const int slower_dims_size
)
{
const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ char buf[];
accscalar_t* alloc = (accscalar_t*)buf;
accscalar_t* s = &alloc[0];
accscalar_t* rnorms_this_block = &alloc[blockDim.x*blockDim.y];
accscalar_t thread_sum = 0.f;
int slower_dims_location = threadIdx.y;
int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0)
{
accscalar_t result = s[threadIdx.x];
accscalar_t norm_this_col = sqrtf(result);
norms[fast_dim_location] = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
}
__syncthreads();
accscalar_t g_this_col = scalar_cast<accscalar_t>(g[fast_dim_location]);
accscalar_t rnorm = rnorms_this_block[threadIdx.x];
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
w[currentIdx] = scalar_cast<scalar_t>(g_this_col*val_f*rnorm);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
void weight_norm_fwd_cuda
(const at::Tensor& w,
const at::Tensor& norms,
const at::Tensor& v,
const at::Tensor& g,
int dim)
{
#ifdef DEBUG_ANY
using namespace std;
cout << "hello from send_to_fwd with v.type() = " << v.type() << endl;
#endif
const int ndims = v.ndimension();
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
int rowSize = 1;
for(int i = ndims - 1; i > 0; i--)
rowSize *= v.size(i);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(v.type(),
"weight_norm_fwd_first_dim_kernel",
[&]
{
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
weight_norm_fwd_first_dim_kernel
<<<v.size(0),
BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
(w.data<cuda_scalar_t>(),
norms.data<accscalar_t>(),
v.data<cuda_scalar_t>(),
g.data<cuda_scalar_t>(),
rowSize);
});
}
else if(dim == ndims - 1)
{
// Precompute slower_dims_size and fast_dim_size
int slower_dims_size = 1;
for(int i = 0; i < ndims - 1; i++)
slower_dims_size *= v.size(i);
int fast_dim_size = v.size(ndims-1);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(v.type(),
"weight_norm_fwd_last_dim_kernel",
[&]
{
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
// just trying this formatting out to see how it feels...
weight_norm_fwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t),
stream>>>
(w.data<cuda_scalar_t>(),
norms.data<accscalar_t>(),
v.data<cuda_scalar_t>(),
g.data<cuda_scalar_t>(),
fast_dim_size,
slower_dims_size);
});
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_fwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
// The kernel execution is asynchronous, so this will only catch errors on the kernel launch,
// not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught
// until a later error check on a synchronizing CUDA call. Unfortunately, without manually
// synchronizing here, this is the best we can do.
THCudaCheck(cudaGetLastError());
#ifdef DEBUG_PROFILE
THCudaCheck(cudaDeviceSynchronize());
#endif
}
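And the corresponding forward math for dim == 0, as a dense PyTorch reference sketch (fp32 accumulation, as in the kernel):

import torch

def weight_norm_fwd_first_dim_reference(v, g):
    rows  = v.size(0)
    v2d   = v.contiguous().view(rows, -1).float()
    norms = v2d.pow(2).sum(dim=1, keepdim=True).sqrt()       # L2 norm of each flattened row
    w     = (g.view(rows, 1).float() * v2d / norms).view_as(v)
    return w, norms.view((rows,) + (1,) * (v.dim() - 1))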
import torch.cuda
import ctypes.util
import os
import re
import subprocess
import torch
from setuptools import setup, find_packages
from distutils.command.clean import clean
from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME
# TODO: multiple modules, so we don't have to route all interfaces through
# the same interface.cpp file?
if not torch.cuda.is_available():
print("Warning: Torch did not find available GPUs on this system.\n",
@@ -22,111 +13,6 @@ if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
raise RuntimeError("APEx requires Pytorch 0.4 or newer.\n" +
"The latest stable release can be obtained from https://pytorch.org/")
version_le_04 = []
if TORCH_MAJOR == 0 and TORCH_MINOR == 4:
version_le_04 = ['-DVERSION_LE_04']
def find(path, regex_func, collect=False):
collection = [] if collect else None
for root, dirs, files in os.walk(path):
for file in files:
if regex_func(file):
if collect:
collection.append(os.path.join(root, file))
else:
return os.path.join(root, file)
return list(set(collection)) if collect else None
# Due to https://github.com/pytorch/pytorch/issues/8223, for Pytorch <= 0.4
# torch.utils.cpp_extension's check for CUDA_HOME fails if there are no GPUs
# available on the system, which prevents cross-compiling and building via Dockerfiles.
# Workaround: manually search for CUDA_HOME if Pytorch <= 0.4.
def find_cuda_home():
cuda_path = None
CUDA_HOME = None
CUDA_HOME = os.getenv('CUDA_HOME', '/usr/local/cuda')
if not os.path.exists(CUDA_HOME):
# We use nvcc path on Linux and cudart path on macOS
cudart_path = ctypes.util.find_library('cudart')
if cudart_path is not None:
cuda_path = os.path.dirname(cudart_path)
if cuda_path is not None:
CUDA_HOME = os.path.dirname(cuda_path)
if not cuda_path and not CUDA_HOME:
nvcc_path = find('/usr/local/', re.compile("nvcc").search, False)
if nvcc_path:
CUDA_HOME = os.path.dirname(nvcc_path)
if CUDA_HOME:
CUDA_HOME = os.path.dirname(CUDA_HOME)
if (not os.path.exists(CUDA_HOME+os.sep+"lib64")
or not os.path.exists(CUDA_HOME+os.sep+"include") ):
raise RuntimeError("Error: found NVCC at ",
nvcc_path,
" but could not locate CUDA libraries"+
" or include directories.")
raise RuntimeError("Error: Could not find cuda on this system. " +
"Please set your CUDA_HOME environment variable "
"to the CUDA base directory.")
return CUDA_HOME
if TORCH_MAJOR == 0 and TORCH_MINOR == 4:
if CUDA_HOME is None:
CUDA_HOME = find_cuda_home()
# Patch cpp_extension's view of CUDA_HOME:
torch.utils.cpp_extension.CUDA_HOME = CUDA_HOME
def get_cuda_version():
NVCC = find(CUDA_HOME+os.sep+"bin",
re.compile('nvcc$|nvcc.exe').search)
print("Found NVCC = ", NVCC)
# Parse output of nvcc to get cuda major version
nvcc_output = subprocess.check_output([NVCC, '--version']).decode("utf-8")
CUDA_LIB = re.compile(', V[0-9]+\.[0-9]+\.[0-9]+').search(nvcc_output).group(0).split('V')[1]
print("Found CUDA_LIB = ", CUDA_LIB)
CUDA_MAJOR = int(CUDA_LIB.split('.')[0])
print("Found CUDA_MAJOR = ", CUDA_MAJOR)
if CUDA_MAJOR < 8:
raise RuntimeError("APex requires CUDA 8.0 or newer")
return CUDA_MAJOR
if CUDA_HOME is not None:
print("Found CUDA_HOME = ", CUDA_HOME)
CUDA_MAJOR = get_cuda_version()
gencodes = ['-gencode', 'arch=compute_50,code=sm_50',
'-gencode', 'arch=compute_52,code=sm_52',
'-gencode', 'arch=compute_60,code=sm_60',
'-gencode', 'arch=compute_61,code=sm_61',]
if CUDA_MAJOR > 8:
gencodes += ['-gencode', 'arch=compute_70,code=sm_70',
'-gencode', 'arch=compute_70,code=compute_70',]
ext_modules = []
extension = CUDAExtension(
'apex_C', [
'csrc/interface.cpp',
'csrc/weight_norm_fwd_cuda.cu',
'csrc/weight_norm_bwd_cuda.cu',
'csrc/scale_cuda.cu',
],
extra_compile_args={'cxx': ['-g'] + version_le_04,
'nvcc': ['-O3'] + version_le_04 + gencodes})
ext_modules.append(extension)
else:
raise RuntimeError("Could not find Cuda install directory")
setup(
name='apex',
version='0.1',
@@ -139,7 +25,5 @@ setup(
'tests',
'examples',
'apex.egg-info',)),
ext_modules=ext_modules,
description='PyTorch Extensions written by NVIDIA',
cmdclass={'build_ext': torch.utils.cpp_extension.BuildExtension},
)
import torch
import numpy as np
def compare(cuda_out, pt_out, pt_out_control, rows):
# print( "Pytorch ops in fp16: ", pt_out )
# print( "Kernel result: ", cuda_out )
# print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)
# Make upconverted copies for error check against fp32 control
cuda_out_fp32 = cuda_out.float()
pt_out_fp32 = pt_out.float()
# Flatten all but the slowest dimension
cuda_out = cuda_out.view(rows,-1)
pt_out = pt_out.view(rows,-1)
cuda_out_fp32 = cuda_out_fp32.view(rows,-1)
pt_out_fp32 = pt_out_fp32.view(rows,-1)
pt_out_control = pt_out_control.view(rows,-1)
cuda_maxdiffs, cuda_maxdiff_locs = torch.max((pt_out_control - cuda_out_fp32).abs(),1)
pt_maxdiffs, pt_maxdiff_locs = torch.max((pt_out_control - pt_out_fp32 ).abs(),1)
print( "cuda_maxdiffs = ", cuda_maxdiffs )
# print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
print( "pt_maxdiffs = ", pt_maxdiffs )
# print( "pt_maxdiff_locs = ", pt_maxdiff_locs )
row_indices = torch.LongTensor(np.arange(rows))
# print("cuda_out at cuda_maxdiff_locs in each row:")
# # bizarrely, this will work if you do it at the python prompt:
# # print(cuda_out[row_indices,cuda_maxdiff_locs])
# # ...but it only seems to work here if you wrap with numpy arrays:
# print( cuda_out[np.array(row_indices),np.array(cuda_maxdiff_locs)])
# print("pt_out_control at cuda_maxdiff_locs in each row:")
# print(pt_out_control[np.array(row_indices),np.array(cuda_maxdiff_locs)])
#
# print("pt_out at pt_maxdiff_locs in each row:" )
# print( pt_out[np.array(row_indices),np.array(pt_maxdiff_locs)])
# print("pt_out_control at pt_maxdiff_locs in each row:" )
# print(pt_out_control[np.array(row_indices),np.array(pt_maxdiff_locs)])
import torch
def get_norm_shape(p, dim):
if dim == 0:
output_size = (p.size(0),) + (1,) * (p.dim() - 1)
return output_size
elif dim == p.dim() - 1:
output_size = (1,) * (p.dim() - 1) + (p.size(-1),)
return output_size
return None
def pt_norm(p, dim):
"""Computes the norm over all dimensions except dim"""
if dim is None:
return p.norm()
elif dim == 0:
return p.contiguous().view(p.size(0), -1).norm(2,dim=1).view(*get_norm_shape(p, dim))
elif dim == p.dim() - 1:
return p.contiguous().view(-1, p.size(-1)).norm(2,dim=0).view(*get_norm_shape(p, dim))
return pt_norm(p.transpose(0, dim), 0).transpose(0, dim)
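A quick shape-level illustration of the two helpers above (values arbitrary; assumes get_norm_shape and pt_norm are in scope, e.g. via "from norm import pt_norm, get_norm_shape" as in the tests):

import torch

p = torch.randn(4, 5, 6)
print(get_norm_shape(p, 0))             # (4, 1, 1)
print(get_norm_shape(p, p.dim() - 1))   # (1, 1, 6)
print(pt_norm(p, 0).size())             # torch.Size([4, 1, 1])
print(pt_norm(p, p.dim() - 1).size())   # torch.Size([1, 1, 6])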
import torch
from torch.autograd import Variable
from apex.fp16_utils import Fused_Weight_Norm
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
rows = 321 # 1
cols = 33 # 4096
fast = 185 # 4096
dims = rows, cols, fast
dim = 0
CUDA_HALF = True
RAND = True # If false, input gradients (the result of the backward pass)
# should be analytically zero.
# Loss will be computed via (output*elementwise).sum().
# This means that output gradients in the backward pass will be equal
# to elementwise, so by manipulating elementwise, we have easy
# fine-grained control over the output gradients we'd like to use for
# testing purposes.
#
# The alternative is just to create the output_gradients manually
# and call output.backward(gradient=output_gradients),
# as is done in test_backward.py.
# But I wanted a minimal working sample similar to an "actual" use case,
# where gradients are computed by calling backward() on a scalar Loss.
if RAND:
# With std=6.0, I observe the pytorch fp16 ops going unstable (sometimes)
# while the fused kernel remains stable.
pt_in_fp32 = torch.cuda.FloatTensor(*dims ).normal_(std=1.0)
norm_shape = get_norm_shape(pt_in_fp32, dim)
pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).normal_(std=1.0)
elementwise_fp32 = torch.cuda.FloatTensor(*dims ).normal_(std=1.0)
else:
pt_in_fp32 = torch.cuda.FloatTensor(*dims ).fill_(1.0)
norm_shape = get_norm_shape(pt_in_fp32, dim)
pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).fill_(2.0)
elementwise_fp32 = torch.cuda.FloatTensor(*dims ).fill_(0.5)
pt_in_fp16 = pt_in_fp32.half()
cd_in_prec = pt_in_fp32.clone()
pt_g_fp16 = pt_g_fp32.half()
cd_g_prec = pt_g_fp32.clone()
elementwise_fp16 = elementwise_fp32.half()
elementwise_prec = elementwise_fp32.clone()
if CUDA_HALF:
cd_in_prec = cd_in_prec.half()
cd_g_prec = cd_g_prec.half()
elementwise_prec = elementwise_prec.half()
pt_in_fp32 = Variable(pt_in_fp32 , requires_grad=True)
pt_in_fp16 = Variable(pt_in_fp16 , requires_grad=True)
cd_in_prec = Variable(cd_in_prec , requires_grad=True)
pt_g_fp32 = Variable(pt_g_fp32 , requires_grad=True)
pt_g_fp16 = Variable(pt_g_fp16 , requires_grad=True)
cd_g_prec = Variable(cd_g_prec , requires_grad=True)
elementwise_fp32 = Variable(elementwise_fp32, requires_grad=False)
elementwise_fp16 = Variable(elementwise_fp16, requires_grad=False)
elementwise_prec = Variable(elementwise_prec, requires_grad=False)
torch.cuda.nvtx.range_push("fp16 forward, {}".format(pt_in_fp16.size()))
pt_norms_fp16 = pt_norm(pt_in_fp16, dim)
pt_out_fp16 = pt_in_fp16*(pt_g_fp16/pt_norms_fp16)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
torch.cuda.nvtx.range_push("fp32 forward, {}".format(pt_in_fp32.size()))
pt_norms_fp32 = pt_norm(pt_in_fp32, dim)
pt_out_fp32 = pt_in_fp32*(pt_g_fp32/pt_norms_fp32)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
# print("pt_norms_fp16 = ", pt_norms_fp16 )
# print("pt_norms_fp32 = ", pt_norms_fp32)
# print( "cd_in_prec.data_ptr = {:x}".format(cd_in_prec.data_ptr()))
# print("elementwise_fp16 = ", elementwise_fp16)
cd_in_contig = cd_in_prec.contiguous()
# Deliberately make noncontig to see if fused_norm
# will handle the error
# cd_in_contig = cd_in_contig[:,0:5]
# print(type(cd_in_contig))
torch.cuda.nvtx.range_push("kernel forward")
fused_weight_norm = Fused_Weight_Norm.apply
cd_out_prec = fused_weight_norm(cd_in_contig, cd_g_prec, dim)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
# print("type(cd_out_prec.data) = ", type(cd_out_prec.data))
# print("cd_out_prec.data_ptr = {:x}".format(cd_out_prec.data_ptr()))
print("\n\n\nCOMPARING FORWARD PASS RESULTS\n\n\n")
compare(cd_out_prec.data,
pt_out_fp16.data,
pt_out_fp32.data,
rows)
# It's ok to use elementwise_fp16 as a leaf in both the cuda and pytorch graphs.
# This sharing should not affect the computed gradients wrt pt_in_fp16 and cd_in_prec.
# However, just remember:
# If we set requires_grad=True for elementwise_fp16, elementwise_fp16.grad.data
# will accumulate gradients during the backward passes for both the cd and pytorch Losses.
#
# I do need v these parentheses v
Loss_cd_prec = (cd_out_prec*elementwise_prec).sum()
# print(L_cd_fp16)
Loss_pt_fp16 = (pt_out_fp16*elementwise_fp16).sum()
# print(L_pt_fp16)
Loss_pt_fp32 = (pt_out_fp32*elementwise_fp32).sum()
# print(L_pt_fp32)
torch.cuda.nvtx.range_push("kernel backward")
Loss_cd_prec.backward()
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("fp16 backward")
Loss_pt_fp16.backward()
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("fp32 backward")
Loss_pt_fp32.backward()
torch.cuda.nvtx.range_pop()
print("\n\n\nCOMPARING v GRADIENT RESULTS\n\n\n")
compare(cd_in_prec.grad.data,
pt_in_fp16.grad.data,
pt_in_fp32.grad.data,
rows)
print("\n\n\nCOMPARING g GRADIENT RESULTS\n\n\n")
compare(cd_g_prec.grad.data,
pt_g_fp16.grad.data,
pt_g_fp32.grad.data,
cd_g_prec.size(0))
import torch
from torch.autograd import Variable
import apex_C
import numpy as np
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
sizes = [
# (3, 512, 1024),
# (3, 512, 1536),
(3, 768, 1536),
# (3, 768, 2048),
# (3, 1024, 2048),
# (1, 1024, 4096),
# (1, 2048, 8192),
# (1, 4096, 4096), # this is not one of the fairseq sizes, just a reference benchmark.
# (4096, 4096, 1), # this is not one of the fairseq sizes, just a reference benchmark.
]
# rows = 3
# cols = 512
# fast = 1024
HALF = True
RAND = True
dim = 2
for rows, cols, fast in sizes:
dims = rows, cols, fast
# Incoming gradient vectors we will use later
# Need to create the fp16 versions as a half() copy of a Tensor first rather than
# a Variable, because if you create pt_input_control as a Variable and then say
# pt_input_fp16 = pt_input_control.half(), you are accidentally making pt_input_fp16 part of
# pt_input_control's computational graph, instead of the leaf of its own separate graph.
# Careful: if you initialize with torch.ones, the gradient wrt input becomes analytically zero.
if RAND:
pLpOutput_control = torch.cuda.FloatTensor(*dims ).uniform_()*1.0
norm_shape = get_norm_shape(pLpOutput_control, dim)
pLpg_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
pt_input_control = torch.cuda.FloatTensor(*dims ).uniform_()
pt_g_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
else:
pLpOutput_control = torch.cuda.FloatTensor(*dims ).fill_(1.)
norm_shape = get_norm_shape(pLpOutput_control, dim)
pLpg_control = torch.cuda.FloatTensor(*norm_shape).fill_(2.)
pt_input_control = torch.cuda.FloatTensor(*dims ).fill_(4.0)
pt_g_control = torch.cuda.FloatTensor(*norm_shape).fill_(3.0)
pLpOutput_fp16 = pLpOutput_control.clone()
pLpg_fp16 = pLpg_control .clone()
pt_input_fp16 = pt_input_control .clone()
pt_g_fp16 = pt_g_control .clone()
if HALF:
pLpOutput_fp16 = pLpOutput_fp16.half()
pLpg_fp16 = pLpg_fp16 .half()
pt_input_fp16 = pt_input_fp16 .half()
pt_g_fp16 = pt_g_fp16 .half()
pLpOutput_control = Variable(pLpOutput_control)
pLpg_control = Variable(pLpg_control )
pLpOutput_fp16 = Variable(pLpOutput_fp16 )
pLpg_fp16 = Variable(pLpg_fp16 )
pt_input_control = Variable(pt_input_control, requires_grad=True)
pt_g_control = Variable(pt_g_control , requires_grad=True)
pt_input_fp16 = Variable(pt_input_fp16 , requires_grad=True)
pt_g_fp16 = Variable(pt_g_fp16 , requires_grad=True)
# Do forward pass in fp16 and fp32
pt_norms_fp16 = pt_norm(pt_input_fp16, dim)
pt_norms_control = pt_norm(pt_input_control, dim)
pt_output_fp16 = pt_input_fp16 *(pt_g_fp16 /pt_norms_fp16 )
pt_output_control = pt_input_control*(pt_g_control/pt_norms_control)
# Run the Cuda version
pLpInput_cuda = torch.cuda.FloatTensor(*dims ).fill_(0.)
pLpg_cuda = torch.cuda.FloatTensor(*norm_shape).fill_(0.)
if HALF:
pLpInput_cuda = pLpInput_cuda.half()
pLpg_cuda = pLpg_cuda .half()
torch.cuda.nvtx.range_push("kernel weight norm backward")
apex_C.weight_norm_bwd(pLpInput_cuda,
pLpg_cuda,
pLpOutput_fp16,
pt_input_fp16,
pt_g_fp16,
pt_norms_control.data,
dim)
torch.cuda.nvtx.range_pop()
print("grad_output: ", pLpOutput_fp16.data)
print(" grad_input: ", pLpInput_cuda)
print(" savedInput: ", pt_input_fp16.data)
print("pt_norms_control: ", pt_norms_control.data)
print("pt_norms_fp16: ", pt_norms_fp16.data)
torch.cuda.nvtx.range_push("pytorch fp16 backward")
pt_output_fp16 .backward(gradient=pLpOutput_fp16 , create_graph=True)
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("pytorch fp32 backward")
pt_output_control.backward(gradient=pLpOutput_control, create_graph=True)
torch.cuda.nvtx.range_pop()
# pt_output_fp16 and pt_output_control are still saved, but
# pt_output_fp16.grad and pt_output_control.grad are None at this point
# because the graph is freed in the backwards pass.
# Specifying create_graph/retain_graph doesn't seem to force saving of
# either the intermediate variables or their gradients.
print("Comparing gradients wrt v")
torch.cuda.nvtx.range_push("compare pLpv")
compare(pLpInput_cuda, pt_input_fp16.grad.data, pt_input_control.grad.data, rows)
torch.cuda.nvtx.range_pop()
print("Comparing gradients wrt g")
torch.cuda.nvtx.range_push("compare pLpg")
compare(pLpg_cuda, pt_g_fp16.grad.data, pt_g_control.grad.data, pLpg_cuda.size(0))
torch.cuda.nvtx.range_pop()
import torch
import sys
import apex_C
import numpy as np
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
sizes = [
# (3, 512, 1024),
# (3, 512, 1536),
# (3, 768, 1536),
# (3, 768, 2048),
# (3, 1024, 2048),
# (1, 1024, 4096),
# (1, 2048, 8192),
# (1, 4096, 4096), # this is not one of the fairseq sizes, just a reference benchmark.
(4096, 4096, 1), # this is not one of the fairseq sizes, just a reference benchmark.
# (353, 55, 353), # this is not one of the fairseq sizes, just a reference benchmark.
]
# rows = 3
# cols = 512
# fast = 1024
HALF = True
RAND = True
dim = 0
for rows, cols, fast in sizes:
dims = rows, cols, fast
print("\n\nTESTING dims = {}\n\n".format(dims))
if RAND:
pt_in = 1.*torch.cuda.FloatTensor(*dims).uniform_()
g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).uniform_()
else:
pt_in = torch.cuda.FloatTensor(*dims).fill_(1.)
g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(6.0)
# per_col = torch.arange(1,cols+1).cuda()
# print((rows*per_col*per_col).sqrt())
# pt_in *= per_col
cuda_out = torch.cuda.FloatTensor(*dims).fill_(0.)
cuda_norms = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(0.)
# Save a copy of the input as float
pt_in_fp32 = pt_in.clone()
g_fp32 = g.clone()
if HALF:
pt_in = pt_in.half()
g = g.half()
cuda_out = cuda_out.half()
apex_C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
torch.cuda.synchronize()
# quit()
print("type(cuda_out) = {}\n".format(type(cuda_out)))
rownorms = pt_norm(pt_in, dim)
rownorms_fp32 = pt_norm(pt_in_fp32, dim)
print("rownorms_fp32:")
print(rownorms_fp32)
print("cuda_norms" )
print(cuda_norms )
# rownorms is broadcast; torch.div(pt_in, rownorms) and pt_in/rownorms work the same way
pt_out = pt_in*(g/rownorms)
pt_out_control = pt_in_fp32*(g_fp32/rownorms_fp32)
compare(cuda_out, pt_out, pt_out_control, rows)