"...git@developer.sourcefind.cn:OpenDAS/ktransformers.git" did not exist on "c6aa379de2251e64bab47a2704266abf150080a8"
Commit d17a015f authored by Michael Carilli's avatar Michael Carilli
Browse files

Transferred backend and build system to use PyTorch C++ extension + ATen dispatch.

parent e733e78c
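For orientation, a minimal sketch of the pattern the new backend adopts (illustrative only, not part of this commit): a host-side function takes at::Tensor arguments, AT_DISPATCH_FLOATING_TYPES_AND_HALF selects the scalar type, and PYBIND11_MODULE exposes the function to Python. The name my_scale and the CPU loop body are made up for the example; the real code below launches CUDA kernels instead.

#include <torch/torch.h>

// Hypothetical op: scale a CPU floating-point tensor in place.
// The dispatch macro instantiates the lambda for float, double, and half,
// defining scalar_t inside the lambda body.
void my_scale(at::Tensor t, float s) {
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(t.type(), "my_scale", [&] {
    scalar_t* p = t.data<scalar_t>();
    for (int64_t i = 0; i < t.numel(); ++i)
      p[i] = static_cast<scalar_t>(static_cast<float>(p[i]) * s);
  });
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("my_scale", &my_scale, "Illustrative in-place scale");
}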
from . import amp
from . import RNN
from . import reparameterization
from . import fp16_utils
from . import parallel
from . import amp
from torch.utils.ffi import _wrap_function
from ._scale_lib import lib as _lib, ffi as _ffi

__all__ = []

def _import_symbols(locals):
    for symbol in dir(_lib):
        fn = getattr(_lib, symbol)
        if callable(fn):
            locals[symbol] = _wrap_function(fn, _ffi)
        else:
            locals[symbol] = fn
        __all__.append(symbol)

_import_symbols(locals())
@@ -4,7 +4,7 @@ import warnings
import torch
from ._C import scale_lib
from apex._C import scale_check_overflow
class AmpHandle(object):
    def __init__(self, enable_caching=True):
@@ -33,9 +33,9 @@ class AmpHandle(object):
        for group in optimizer.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    scale_lib.scale_check_overflow(p.grad.data,
                                                   1. / self._loss_scale,
                                                   self._overflow_buf)
                    scale_check_overflow(p.grad.data,
                                         1. / self._loss_scale,
                                         self._overflow_buf)
        if self._overflow_buf.any():
            self._loss_scale /= 2.
optimizer_step = optimizer.step
......
#include <THC/THC.h>
#include "scale_kernel.h"
extern THCState *state;
void scale_check_overflow(THCudaTensor *grads,
float scale,
THCudaByteTensor *overflow_buf) {
size_t num_elems = THCudaTensor_nElement(state, grads);
float *d_grads = THCudaTensor_data(state, grads);
size_t buf_elems = THCudaByteTensor_nElement(state, overflow_buf);
uint8_t *d_overflow_buf = THCudaByteTensor_data(state, overflow_buf);
scale_check_overflow_kernel(state, d_grads, num_elems, scale,
d_overflow_buf, buf_elems);
}
void scale_check_overflow(THCudaTensor *grads,
float scale,
THCudaByteTensor *overflow_buf);
#ifndef SCALE_KERNEL_H
#define SCALE_KERNEL_H
#include <THC/THC.h>
#ifdef __cplusplus
extern "C" {
#endif
void scale_check_overflow_kernel(THCState *state,
float *d_grads, size_t n, float scale,
uint8_t *d_buf, size_t buf_n);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // SCALE_KERNEL_H
# This file contains the cffi-extension call to build the custom
# kernel used by amp.
# For mysterious reasons, it needs to live at the top-level directory.
# TODO: remove this when we move to cpp-extension.
import os
import torch
from torch.utils.ffi import create_extension
assert torch.cuda.is_available()
abs_path = os.path.dirname(os.path.realpath(__file__))
sources = ['apex/amp/src/scale_cuda.c']
headers = ['apex/amp/src/scale_cuda.h']
defines = [('WITH_CUDA', None)]
with_cuda = True
extra_objects = [os.path.join(abs_path, 'build/scale_kernel.o')]
# When running `python build_cffi.py` directly, set package=False. But
# if it's used with `cffi_modules` in setup.py, then set package=True.
package = (__name__ != '__main__')
extension = create_extension(
    'apex.amp._C.scale_lib',
    package=package,
    headers=headers,
    sources=sources,
    define_macros=defines,
    relative_to=__file__,
    with_cuda=with_cuda,
    extra_objects=extra_objects
)

if __name__ == '__main__':
    extension.build()
#define PY_SSIZE_T_CLEAN
#define ARG_OFFSET 5
#include <Python.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <string>
#include <cmath>
#include <cassert>
#include <iostream>
// #define USE_NVTX
#ifdef USE_NVTX
#include "nvToolsExt.h"
#endif
//Meta-data format we will use
#include <THCTensorInfo.cuh>
//Cuda kernels
#include <kernel.h>
#define ERROR_MSG cout << "Error at " << __FILE__ << ":" << __LINE__ << "\n";
using namespace std;
TensorInfo<void, idxType> PyOb_2_tinfo(PyObject* tensor, float_types data_type)
{
PyObject* PyStrides = PyObject_CallMethod(tensor, "stride", NULL);
if(PyStrides == NULL)
{
ERROR_MSG;
cout << "PyStrides = NULL" << endl;
}
PyObject* PySizes = PyObject_CallMethod(tensor, "size", NULL);
if(PySizes == NULL)
{
ERROR_MSG;
cout << "PySizes = NULL" << endl;
}
PyObject* PyDataPtr = PyObject_CallMethod(tensor, "data_ptr", NULL);
if(PyDataPtr == NULL)
{
ERROR_MSG;
cout << "PyDataPtr = NULL" << endl;
}
void* data_ptr = (void*) PyLong_AsLong(PyDataPtr);
Py_ssize_t ndims = PyList_GET_SIZE(PySizes);
// TODO put proper checking on ndims < MAX_CUTORCH_DIMS
idxType strides[MAX_CUTORCH_DIMS], sizes[MAX_CUTORCH_DIMS];
for(int i = 0; i < ndims; i++)
{
strides[i] = PyLong_AsLong(PyTuple_GetItem(PyStrides, i));
sizes[i] = PyLong_AsLong(PyTuple_GetItem(PySizes, i));
}
Py_DECREF(PyStrides);
Py_DECREF(PySizes);
Py_DECREF(PyDataPtr);
return TensorInfo<void, idxType>(data_ptr, ndims, sizes, strides, data_type);
}
vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
{
vector<TensorInfo<void, idxType> > info_vec;
#ifdef DEBUG_ANY
cout << "Processing " << PyTuple_GET_SIZE(args) << " arguments" << endl;
#endif
#ifdef CHECK_MEMLEAK
for(int iter = 0; iter < 1e7; iter++ )
#endif
for(Py_ssize_t i = 0; i<PyTuple_GET_SIZE(args) - 1; i++)
{
PyObject* pyTensor = PyTuple_GetItem(args, i);
// check type, only take if Tensor, Variable, or Parameter
string objType(pyTensor->ob_type->tp_name);
PyObject* pyObjTypeCall = PyObject_CallMethod(pyTensor, "type", NULL);
if(pyObjTypeCall == NULL)
{
ERROR_MSG;
cout << "For args item " << i << ", pyObjTypeCall = NULL" << endl;
}
PyObject* pyObjASCII = PyUnicode_AsASCIIString(pyObjTypeCall);
if(pyObjASCII == NULL)
{
ERROR_MSG;
cout << "For args item " << i << ", pyObjASCII = NULL " << endl;
}
Py_DECREF(pyObjTypeCall);
string objTypeCall(PyBytes_AsString(pyObjASCII));
Py_DECREF(pyObjASCII);
#ifdef DEBUG_ANY
cout << "arg " << i << endl;
cout << "objType = " << objType << endl;
cout << "objTypeCall = " << objTypeCall << endl;
#endif
if(objTypeCall == "torch.cuda.FloatTensor")
#ifdef CHECK_MEMLEAK
if(iter == 0 )
#endif
info_vec.push_back(PyOb_2_tinfo(pyTensor, FLOAT));
#ifdef CHECK_MEMLEAK
else
info_vec[i] = PyOb_2_tinfo(pyTensor, FLOAT);
#endif
else if(objTypeCall == "torch.cuda.HalfTensor")
info_vec.push_back(PyOb_2_tinfo(pyTensor, HALF));
// TODO add double
else
{
ERROR_MSG;
cout << "For args item " << i << ", unsupported .type() found: "
<< objTypeCall << "\n"
"Supported types:\n"
"torch.cuda.FloatTensor\n"
"torch.cuda.HalfTensor\n"
"torch.autograd.variable.Variable containing FloatTensor\n"
"torch.autograd.variable.Variable containing HalfTensor\n"
"torch.nn.parameter.Parameter containing FloatTensor\n"
"torch.nn.parameter.Parameter containing HalfTensor\n"
<< endl;
}
}
return info_vec;
}
int getLastArg_AsInt(PyObject* args)
{
// None of these should return new references so I don't think this leaks memory.
int dims = PyLong_AsLong(PyTuple_GetItem(args, PyTuple_GET_SIZE(args) - 1));
return dims;
}
// Stepping stone, can evolve to be more general (argument forwarding?)
template<typename wrapper>
void dispatch
(
float_types rtti,
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
switch(rtti)
{
case FLOAT:
wrapper::template call<float, float, idxType>(tensors, dim);
break;
case HALF:
wrapper::template call<half, float, idxType>(tensors, dim);
break;
default:
std::cout << "Unsupported rtti in Module.cpp:dispatch()" << std::endl;
PyErr_SetString(PyExc_RuntimeError, "Unsupported data type in Module.cpp:dispatch, "
"supported data types are half and float");
exit(-1);
}
}
// Will extract all tensors in order. Assumes flat structure, tensors can not be wrapped in lists
// tuples or any other iterator structure.
static PyObject* weight_norm_fwd(PyObject* self, PyObject* args)
{
#ifdef USE_NVTX
nvtxRangePushA("weight_norm_fwd C backend");
#endif
vector<TensorInfo<void, idxType> > tensors = get_TInfos(args);
int dim = getLastArg_AsInt(args);
if(dim != 0 && dim != tensors[2].dims - 1)
PyErr_SetString(PyExc_RuntimeError, "weight_norm_fwd currently only "
"supports first or last dimension.");
else
{
#ifdef DEBUG_ANY
cout << "tensors.size() = " << tensors.size() << ", dim = " << dim << endl;
#endif
dispatch<send_to_fwd_wrapper>(tensors[0].type, tensors, dim);
#ifdef USE_NVTX
nvtxRangePop();
#endif
}
Py_RETURN_NONE;
}
static PyObject* weight_norm_bwd(PyObject* self, PyObject* args)
{
#ifdef USE_NVTX
nvtxRangePushA("weight_norm_bwd C backend");
#endif
vector<TensorInfo<void, idxType> >tensors = get_TInfos(args);
int dim = getLastArg_AsInt(args);
if(dim != 0 && dim != tensors[3].dims - 1)
PyErr_SetString(PyExc_RuntimeError, "weight_norm_bwd currently only "
"supports first or last dimension.");
else
{
#ifdef DEBUG_ANY
cout << "tensors.size() = " << tensors.size() << ", dim = " << dim << endl;
#endif
dispatch<send_to_bwd_wrapper>(tensors[0].type, tensors, dim);
#ifdef USE_NVTX
nvtxRangePop();
#endif
}
Py_RETURN_NONE;
}
//*******************PYTHON BOILER PLATE*******************
static PyMethodDef apex_methods[] = {
{"weight_norm_fwd", (PyCFunction) weight_norm_fwd, METH_VARARGS, "Slowest-dim norm, forward pass."},
{"weight_norm_bwd", (PyCFunction) weight_norm_bwd, METH_VARARGS, "Slowest-dim norm, backward pass."},
{NULL, NULL, 0, NULL}
};
#if PY_MAJOR_VERSION >= 3
//Module Definitions
static struct PyModuleDef apex = {
PyModuleDef_HEAD_INIT, "apex._C", "Module to add CUDA extensions to Pytorch.", -1, apex_methods
};
//Initialization Function
PyMODINIT_FUNC PyInit__C(void){
//Let's throw an error if we can't find pytorch.
PyImport_ImportModule("torch");
Py_Initialize();
return PyModule_Create(&apex);
}
#else
PyMODINIT_FUNC initMODULE(void){
//Let's throw an error if we can't find pytorch.
PyImport_ImportModule("torch");
(void) Py_InitModule3("apex._C", apex, "A PyTorch Extension.");
}
#endif
//*********************************************************
#include <torch/torch.h>
// Ideally, I'd like to call this file "weight_norm.cu" and put the interface and the implementation
// here, but I can't make nvcc play well with torch.h. For now, use a layer of indirection
// and separate .cu implementation files.
// If we want everything to be part of "apex._C", we need all the interface functions defined
// in this file, or linker will complain about "multiple definitions of PyInit".
// TODO: multiple modules?
// TODO: modify fwd+bwd calls to return a tuple of Tensors. This will require changing the
// Python client code as well. For now, get things working with the same Python-side API.
void weight_norm_fwd_cuda
(const at::Tensor& w,
const at::Tensor& norms,
const at::Tensor& v,
const at::Tensor& g,
int dim);
void weight_norm_fwd
(at::Tensor w,
at::Tensor norms,
at::Tensor v,
at::Tensor g,
int dim)
{
weight_norm_fwd_cuda(w, norms, v, g, dim);
}
void weight_norm_bwd_cuda
(const at::Tensor& pLpv,
const at::Tensor& pLpg,
const at::Tensor& pLpw,
const at::Tensor& savedv,
const at::Tensor& savedg,
const at::Tensor& savedNorms,
int dim);
void weight_norm_bwd
(at::Tensor pLpv,
at::Tensor pLpg,
at::Tensor pLpw,
at::Tensor savedv,
at::Tensor savedg,
at::Tensor savedNorms,
int dim)
{
weight_norm_bwd_cuda(pLpv, pLpg, pLpw, savedv, savedg, savedNorms, dim);
}
void scale_check_overflow_cuda
(const at::Tensor& d_grads,
float scale,
const at::Tensor& d_buf);
void scale_check_overflow
(at::Tensor grads,
float scale,
at::Tensor overflow_buf)
{
AT_CHECK(grads.type().is_cuda(), "x must be a CUDA tensor");
AT_CHECK(overflow_buf.type().is_cuda(), "y must be a CUDA tensor");
scale_check_overflow_cuda(grads, scale, overflow_buf);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("weight_norm_fwd", &weight_norm_fwd, "Fused weight norm, forward pass");
m.def("weight_norm_bwd", &weight_norm_bwd, "Fused weight norm, backward pass");
m.def("scale_check_overflow", &scale_check_overflow, "Fused overflow check + scale for FP32 tensors");
}
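Regarding the TODO above about returning a tuple of Tensors: one possible shape for that interface (an illustrative sketch only, keeping today's caller-allocated outputs) would be

#include <tuple>  // std::tuple; pybind11 converts it to a Python tuple automatically

// Hypothetical variant of weight_norm_fwd for the TODO above: forward to the same
// CUDA implementation, but hand the outputs back so the Python side could write
// "w, norms = weight_norm_fwd(...)". It would be registered with m.def like the
// functions above; weight_norm_fwd_tuple is a made-up name for this sketch.
std::tuple<at::Tensor, at::Tensor> weight_norm_fwd_tuple
  (at::Tensor w,
   at::Tensor norms,
   at::Tensor v,
   at::Tensor g,
   int dim)
{
  weight_norm_fwd_cuda(w, norms, v, g, dim);
  return std::make_tuple(w, norms);
}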
#include "../include/kernel.h"
template<typename T> struct TtoInt { static const int test = -1; };
template<> struct TtoInt<float> { static const int test = 0; };
template<> struct TtoInt<half> { static const int test = 0; };
template<> struct TtoInt<double> { static const int test = 0; };
#if __CUDACC_VER_MAJOR__ >= 9
#define __SHFL_DOWN(var, delta) __shfl_down_sync(0xffffffff, var, delta)
#else
#define __SHFL_DOWN(var, delta) __shfl_down(var, delta)
#endif
#if __CUDACC_VER_MAJOR__ >= 9
#define __SYNCWARP __syncwarp()
#else
#define __SYNCWARP
#endif
// Block size for weight_norm_*_first_dim_kernel.
// Currently, kernels are non-persistent.
// Dialing up the block size to, say 1024, can improve performance by
// increasing the amount of cache available per block, which can improve cache hit rate.
// However, this is less efficient for short rows. 256 is pretty versatile.
// May be worth implementing heuristics later.
#define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel.
// This is trickier than the first_dim case because we must make blocks
// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses.
// Since output-element parallelism is along the fast dimension, this reduces the number of
// blocks we can launch by 16X.
#define TILE_W 16
// Somewhat versatile strategy: max out intra-block parallelism by extending
// blocks across the slow dimension up to the hardware-max block size of 1024.
#define TILE_H 64
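// Worked example of the resulting launch shapes (illustrative numbers, added for clarity,
// not part of the original file): for a last-dim norm with fast_dim_size = 1024 and
// slower_dims_size = 4096, the launch used further down becomes
//     grid  = (1024 + TILE_W - 1) / TILE_W            = 64 blocks,
//     block = dim3(TILE_W, TILE_H)                    = 16 x 64 = 1024 threads,
//     smem  = (TILE_W*TILE_H + TILE_W)*sizeof(float)  = 4160 bytes per block,
// while a first-dim norm over v of shape (N, ...) launches N blocks of BLOCK = 256
// threads with BLOCK*sizeof(float) = 1024 bytes of shared memory each.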
using namespace std;
// lanes is intended to be <= 32.
template <typename T>
__device__ __forceinline__ void reduce_block_into_lanes(T *x, T val, int lanes)
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y;
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] += x[tid+i]; // JoinOp
__syncthreads();
}
if(tid < 32)
{
T final;
if(blockSize >= 64)
final = x[tid] + x[tid+32]; // JoinOp
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final += __SHFL_DOWN(final, i);
if(tid < lanes)
x[tid] = final; // EpilogueOp
}
// Make sure the smem result is visible to all warps.
__syncthreads();
}
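// Usage note (added comment, not in the original file): after this function returns,
// x[0..lanes-1] hold the reduction of all blockSize partial values. The first-dim
// kernels below call it with lanes = 1 and read s[0]; the last-dim kernels call it
// with lanes = blockDim.x and read s[threadIdx.x].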
template <typename T, typename IndexType>
__global__ void weight_norm_fwd_first_dim_kernel
(
TensorInfo<T, IndexType> w,
TensorInfo<float, IndexType> norms,
TensorInfo<T, IndexType> v,
TensorInfo<T, IndexType> g,
IndexType rowSize
)
{
// We are norming each slowest-dim row of the tensor separately.
// For now, assign one block to each row.
IndexType tid = threadIdx.x;
IndexType row = blockIdx.x;
IndexType stride = blockDim.x;
// Logical index offset for this flattened row
IndexType rowStart = row*rowSize;
extern __shared__ float s[];
float thread_sum = 0.f;
for(IndexType i = tid; i < rowSize; i += stride )
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, i + rowStart));
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1);
float result = s[0];
result = sqrtf(result);
if(tid == 0)
DEVICE_LINEAR_GET_F(norms, row) = result;
// Broadcast load, could use shared memory instead.
float g_this_row = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(g, row));
float rnorm = 1.f/result; // for consistency with backward kernel
// Write data to output
for(IndexType i = tid; i < rowSize; i += stride )
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, i + rowStart));
DEVICE_LINEAR_GET(w, i + rowStart) = ScalarConvert<float,T>::to(g_this_row*val_f*rnorm);
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_fwd_last_dim_kernel
(
TensorInfo<T, IndexType> w,
TensorInfo<float, IndexType> norms,
TensorInfo<T, IndexType> v,
TensorInfo<T, IndexType> g,
IndexType fast_dim_size,
IndexType slower_dims_size
)
{
IndexType fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ float alloc[];
float* s = &alloc[0];
float* rnorms_this_block = &alloc[blockDim.x*blockDim.y];
float thread_sum = 0.f;
IndexType slower_dims_location = threadIdx.y;
IndexType currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, currentIdx));
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x);
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0)
{
float result = s[threadIdx.x];
float norm_this_col = sqrtf(result);
DEVICE_LINEAR_GET_F(norms, fast_dim_location) = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
}
__syncthreads();
float g_this_col = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(g, fast_dim_location));
float rnorm = rnorms_this_block[threadIdx.x];
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, currentIdx));
DEVICE_LINEAR_GET(w, currentIdx) = ScalarConvert<float,T>::to(g_this_col*val_f*rnorm);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_bwd_first_dim_kernel
(
TensorInfo<T, IndexType> pLpv,
TensorInfo<T, IndexType> pLpg,
TensorInfo<T, IndexType> pLpw,
TensorInfo<T, IndexType> savedv,
TensorInfo<T, IndexType> savedg,
TensorInfo<float, IndexType> savedNorms,
IndexType rowSize
)
{
// For now, assign one block to each row.
IndexType tid = threadIdx.x;
IndexType row = blockIdx.x;
IndexType stride = blockDim.x;
// Logical index offset for this flattened row
IndexType rowStart = row*rowSize;
extern __shared__ float s[];
float thread_sum = 0.f;
for(IndexType i = tid; i < rowSize; i += stride )
{
float pLpwi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, i + rowStart));
float savedvi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, i + rowStart));
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1);
float result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
// more handy to keep around.
// Broadcast load; could use shared memory instead.
float rnorm = 1.f/DEVICE_LINEAR_GET_F(savedNorms, row);
float rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(tid == 0)
DEVICE_LINEAR_GET(pLpg, row) = ScalarConvert<float, T>::to(result*rnorm);
// Broadcast load, could use shared memory instead.
float g_this_row = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedg, row));
// Write v gradients. We are reusing values that were loaded earlier, so there
// is an optimization opportunity here (store values persistently).
for(IndexType j = tid; j < rowSize; j += stride )
{
float pLpwj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, j + rowStart));
float savedvj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, j + rowStart));
float pLpvj = g_this_row*(rnorm*pLpwj - rnorm3*savedvj*result);
DEVICE_LINEAR_GET(pLpv, j + rowStart) = ScalarConvert<float,T>::to(pLpvj);
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_bwd_last_dim_kernel
(
TensorInfo<T, IndexType> pLpv,
TensorInfo<T, IndexType> pLpg,
TensorInfo<T, IndexType> pLpw,
TensorInfo<T, IndexType> savedv,
TensorInfo<T, IndexType> savedg,
TensorInfo<float, IndexType> savedNorms,
IndexType fast_dim_size,
IndexType slower_dims_size
)
{
IndexType fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ float s[];
float thread_sum = 0.f;
IndexType slower_dims_location = threadIdx.y;
IndexType currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float pLpwi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, currentIdx));
float savedvi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, currentIdx));
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x);
float result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
float rnorm = 1.f/DEVICE_LINEAR_GET_F(savedNorms, fast_dim_location);
float rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(threadIdx.y == 0)
DEVICE_LINEAR_GET(pLpg, fast_dim_location) = ScalarConvert<float, T>::to(result*rnorm);
// Entire block pulls these values, could use shared memory instead.
float g_this_col = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedg, fast_dim_location));
// Write v gradients.
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float pLpwj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, currentIdx));
float savedvj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, currentIdx));
float pLpvj = g_this_col*(rnorm*pLpwj - rnorm3*savedvj*result);
DEVICE_LINEAR_GET(pLpv, currentIdx) = ScalarConvert<float,T>::to(pLpvj);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
template<typename DataType,
typename AccumType,
typename IndexType>
void send_to_fwd_wrapper::call
(
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
#ifdef DEBUG_ANY
cout << "hello from send_to_fwd with v.type = " << v.type << endl;
#endif
auto w (*((TensorInfo<DataType , idxType>*)&tensors[0]));
auto norms(*((TensorInfo<AccumType, idxType>*)&tensors[1]));
auto v (*((TensorInfo<DataType , idxType>*)&tensors[2]));
auto g (*((TensorInfo<DataType , idxType>*)&tensors[3]));
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
IndexType rowSize = 1;
for(IndexType i = v.dims - 1; i > 0; i--)
rowSize *= v.sizes[i];
weight_norm_fwd_first_dim_kernel<<<v.sizes[0], BLOCK, BLOCK*sizeof(float)>>>
(
w,
norms,
v,
g,
rowSize
);
}
else if(dim == v.dims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
IndexType slower_dims_size = 1;
for(IndexType i = 0; i < v.dims - 1; i++)
slower_dims_size *= v.sizes[i];
int fast_dim_size = v.sizes[v.dims-1];
weight_norm_fwd_last_dim_kernel<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(float)>>>
(
w,
norms,
v,
g,
fast_dim_size,
slower_dims_size
);
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_fwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
#ifdef DEBUG_PROFILE
cudaDeviceSynchronize();
#endif
}
template<typename DataType,
typename AccumType,
typename IndexType>
void send_to_bwd_wrapper::call
(
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
#ifdef DEBUG_ANY
cout << "Hello from send_to_bwd with pLpw.type = " << pLpw.type << endl;
#endif
auto pLpv (*((TensorInfo<DataType , idxType>*)&tensors[0]));
auto pLpg (*((TensorInfo<DataType , idxType>*)&tensors[1]));
auto pLpw (*((TensorInfo<DataType , idxType>*)&tensors[2]));
auto savedv (*((TensorInfo<DataType , idxType>*)&tensors[3]));
auto savedg (*((TensorInfo<DataType , idxType>*)&tensors[4]));
auto savedNorms(*((TensorInfo<AccumType, idxType>*)&tensors[5]));
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
IndexType rowSize = 1;
for(IndexType i = savedv.dims - 1; i > 0; i--)
rowSize *= savedv.sizes[i];
weight_norm_bwd_first_dim_kernel<<<pLpw.sizes[0], BLOCK, BLOCK*sizeof(float)>>>
(
pLpv,
pLpg,
pLpw,
savedv,
savedg,
savedNorms,
rowSize
);
}
else if(dim == savedv.dims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
IndexType slower_dims_size = 1;
for(IndexType i = 0; i < savedv.dims - 1; i++)
slower_dims_size *= savedv.sizes[i];
int fast_dim_size = savedv.sizes[savedv.dims-1];
weight_norm_bwd_last_dim_kernel<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(float)>>>
(
pLpv,
pLpg,
pLpw,
savedv,
savedg,
savedNorms,
fast_dim_size,
slower_dims_size
);
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_bwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
#ifdef DEBUG_PROFILE
cudaDeviceSynchronize();
#endif
}
#define INSTANTIATE_SEND_TO_FWD(DATATYPE, ACCUMTYPE, IDXTYPE) \
template void send_to_fwd_wrapper::call<DATATYPE, ACCUMTYPE, IDXTYPE> \
( \
vector<TensorInfo<void, idxType>>&, \
int \
);
INSTANTIATE_SEND_TO_FWD(float, float, idxType)
INSTANTIATE_SEND_TO_FWD(half, float, idxType)
#undef INSTANTIATE_SEND_TO_FWD
#define INSTANTIATE_SEND_TO_BWD(DATATYPE, ACCUMTYPE, IDXTYPE) \
template void send_to_bwd_wrapper::call<DATATYPE, ACCUMTYPE, IDXTYPE> \
( \
vector<TensorInfo<void, idxType>>&, \
int \
);
INSTANTIATE_SEND_TO_BWD(float, float, idxType)
INSTANTIATE_SEND_TO_BWD(half, float, idxType)
#undef INSTANTIATE_SEND_TO_BWD
#undef BLOCK
#undef TILE_W
#undef TILE_H
#include <cuda.h>
#include <cuda_runtime.h>
#if __CUDACC_VER_MAJOR__ >= 9
#define __SHFL_DOWN(var, delta) __shfl_down_sync(0xffffffff, var, delta)
#else
#define __SHFL_DOWN(var, delta) __shfl_down(var, delta)
#endif
#if __CUDACC_VER_MAJOR__ >= 9
#define __SYNCWARP __syncwarp()
#else
#define __SYNCWARP
#endif
// Block size for weight_norm_*_first_dim_kernel.
// Currently, kernels are non-persistent.
// Dialing up the block size to, say 1024, can improve performance by
// increasing the amount of cache available per block, which can improve cache hit rate.
// However, this is less efficient for short rows. 256 is pretty versatile.
// May be worth implementing heuristics later.
#define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel.
// This is trickier than the first_dim case because we must make blocks
// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses.
// Since output-element parallelism is along the fast dimension, this reduces the number of
// blocks we can launch by 16X.
#define TILE_W 16
// Somewhat versatile strategy: max out intra-block parallelism by extending
// blocks across the slow dimension up to the hardware-max block size of 1024.
#define TILE_H 64
// For reference, in THCTensorMathReduce.cuh:
// template <typename T>
// struct ReduceAdd {
// inline __device__ T operator()(const T a, const T b) const {
// return THCNumerics<T>::add(a, b);
// }
// };
// lanes is intended to be <= 32.
template
<typename T,
typename ReduceOp>
__device__ __forceinline__ void reduce_block_into_lanes
(T *x,
T val,
int lanes,
ReduceOp reduceOp)
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y;
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] = reduceOp(x[tid], x[tid+i]);
__syncthreads();
}
if(tid < 32)
{
T final;
if(blockSize >= 64)
final = reduceOp(x[tid], x[tid+32]);
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final = reduceOp(final, __SHFL_DOWN(final, i));
if(tid < lanes)
x[tid] = final; // EpilogueOp
}
// Make sure the smem result is visible to all warps.
__syncthreads();
}
#include "scale_kernel.h"
#include <ATen/ATen.h>
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
#include <assert.h>
#define BLOCK_SIZE 1024
#define MAX_BLOCKS 1024
#ifdef __cplusplus
extern "C" {
#endif
__global__
void scale_reduce_overflow(float *in, size_t n, float scale,
uint8_t *overflow_out) {
// It makes sense to lock the type to "float" here because the downscaling
// should only be applied to the FP32 master gradients. Also, if "in" were
// a different type, it would require divergent code for the vectorized load logic.
__global__ void scale_reduce_overflow
(float *in,
size_t n,
float scale,
uint8_t *overflow_out)
{
__shared__ uint8_t cta_overflow[BLOCK_SIZE];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -71,17 +79,25 @@ void scale_reduce_overflow(float *in, size_t n, float scale,
}
}
void scale_check_overflow_kernel(THCState *state,
float *d_grads, size_t n, float scale,
uint8_t *d_buf, size_t buf_n) {
int num_blks = min((int(n) + BLOCK_SIZE - 1) / BLOCK_SIZE,
MAX_BLOCKS);
assert(buf_n >= num_blks);
cudaStream_t cur_stream = THCState_getCurrentStream(state);
scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, cur_stream>>>(
d_grads, n, scale, d_buf);
void scale_check_overflow_cuda
(const at::Tensor& d_grads,
float scale,
const at::Tensor& d_buf)
{
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
size_t n = d_grads.numel();
size_t buf_n = d_buf.numel();
int num_blks = min((int(n) + BLOCK_SIZE - 1) / BLOCK_SIZE,
MAX_BLOCKS);
assert(buf_n >= num_blks);
scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>
(d_grads.data<float>(),
n,
scale,
d_buf.data<uint8_t>());
THCudaCheck(cudaGetLastError());
}
#ifdef __cplusplus
} // extern "C"
#endif
\ No newline at end of file
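A worked example of the launch arithmetic in scale_check_overflow_cuda above (illustrative numbers, not from the commit): with BLOCK_SIZE = 1024 and MAX_BLOCKS = 1024, a gradient tensor of n = 2^22 elements yields (n + 1023) / 1024 = 4096 candidate blocks, clamped to num_blks = 1024, so the assert requires overflow_buf to hold at least 1024 entries (presumably one flag per block); the Python handle then reduces that buffer with overflow_buf.any().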
#include "kernel_utils.cuh"
#include <ATen/ATen.h>
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_bwd_first_dim_kernel
(scalar_t* __restrict__ pLpv,
scalar_t* __restrict__ pLpg,
const scalar_t* __restrict__ pLpw,
const scalar_t* __restrict__ savedv,
const scalar_t* __restrict__ savedg,
const accscalar_t* __restrict__ savedNorms,
const int rowSize)
{
// For now, assign one block to each row.
const int tid = threadIdx.x;
const int row = blockIdx.x;
const int stride = blockDim.x;
// Logical index offset for this flattened row
const int rowStart = row*rowSize;
// Hack to get around nvcc complaining when an smem array is declared with the same name
// but different types in different kernels (in this case different instantiations)
// extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s"
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t pLpwi = scalar_cast<accscalar_t>(pLpw[i+rowStart]);
accscalar_t savedvi = scalar_cast<accscalar_t>(savedv[i+rowStart]);
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
accscalar_t result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
// more handy to keep around.
// Broadcast load; could use shared memory instead.
accscalar_t rnorm = 1.f/savedNorms[row];
accscalar_t rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(tid == 0)
pLpg[row] = scalar_cast<scalar_t>(result*rnorm);
// Broadcast load, could use shared memory instead.
accscalar_t g_this_row = scalar_cast<accscalar_t>(savedg[row]);
// Write v gradients. We are reusing values that were loaded earlier, so there
// is an optimization opportunity here (store values persistently).
for(int j = tid; j < rowSize; j += stride )
{
accscalar_t pLpwj = scalar_cast<accscalar_t>(pLpw[j+rowStart]);
accscalar_t savedvj = scalar_cast<accscalar_t>(savedv[j+rowStart]);
accscalar_t pLpvj = g_this_row*(rnorm*pLpwj - rnorm3*savedvj*result);
pLpv[j+rowStart] = scalar_cast<scalar_t>(pLpvj);
}
}
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_bwd_last_dim_kernel
(scalar_t* __restrict__ pLpv,
scalar_t* __restrict__ pLpg,
const scalar_t* __restrict__ pLpw,
const scalar_t* __restrict__ savedv,
const scalar_t* __restrict__ savedg,
const accscalar_t* __restrict__ savedNorms,
const int fast_dim_size,
const int slower_dims_size)
{
const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
int slower_dims_location = threadIdx.y;
int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t pLpwi = scalar_cast<accscalar_t>(pLpw[currentIdx]);
accscalar_t savedvi = scalar_cast<accscalar_t>(savedv[currentIdx]);
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
accscalar_t result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
accscalar_t rnorm = 1.f/savedNorms[fast_dim_location];
accscalar_t rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(threadIdx.y == 0)
pLpg[fast_dim_location] = scalar_cast<scalar_t>(result*rnorm);
// Entire block pulls these values, could use shared memory instead.
accscalar_t g_this_col = scalar_cast<accscalar_t>(savedg[fast_dim_location]);
// Write v gradients.
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t pLpwj = scalar_cast<accscalar_t>(pLpw[currentIdx]);
accscalar_t savedvj = scalar_cast<accscalar_t>(savedv[currentIdx]);
accscalar_t pLpvj = g_this_col*(rnorm*pLpwj - rnorm3*savedvj*result);
pLpv[currentIdx] = scalar_cast<scalar_t>(pLpvj);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
void weight_norm_bwd_cuda
(const at::Tensor& pLpv,
const at::Tensor& pLpg,
const at::Tensor& pLpw,
const at::Tensor& savedv,
const at::Tensor& savedg,
const at::Tensor& savedNorms,
int dim)
{
#ifdef DEBUG_ANY
using namespace std;
cout << "Hello from send_to_bwd with pLpw.type = " << pLpw.type << endl;
#endif
const int ndims = savedv.ndimension();
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
int rowSize = 1;
for(int i = ndims - 1; i > 0; i--)
rowSize *= savedv.size(i);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(savedv.type(),
"weight_norm_bwd_first_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
weight_norm_bwd_first_dim_kernel
<<<pLpw.size(0),
BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
(pLpv.data<cuda_scalar_t>(),
pLpg.data<cuda_scalar_t>(),
pLpw.data<cuda_scalar_t>(),
savedv.data<cuda_scalar_t>(),
savedg.data<cuda_scalar_t>(),
savedNorms.data<accscalar_t>(),
rowSize);
});
}
else if(dim == ndims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
int slower_dims_size = 1;
for(int i = 0; i < ndims - 1; i++)
slower_dims_size *= savedv.size(i);
int fast_dim_size = savedv.size(ndims-1);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(savedv.type(),
"weight_norm_bwd_last_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
weight_norm_bwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t),
stream>>>
(pLpv.data<cuda_scalar_t>(),
pLpg.data<cuda_scalar_t>(),
pLpw.data<cuda_scalar_t>(),
savedv.data<cuda_scalar_t>(),
savedg.data<cuda_scalar_t>(),
savedNorms.data<accscalar_t>(),
fast_dim_size,
slower_dims_size);
});
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_bwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
// The kernel execution is asynchronous, so this will only catch errors on the kernel launch,
// not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught
// until a later error check on a synchronizing CUDA call. Unfortunately, without manually
// synchronizing here, this is the best we can do.
THCudaCheck(cudaGetLastError());
#ifdef DEBUG_PROFILE
THCudaCheck(cudaDeviceSynchronize());
#endif
}
#include "kernel_utils.cuh"
#include <ATen/ATen.h>
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_fwd_first_dim_kernel
(scalar_t* __restrict__ w,
accscalar_t* __restrict__ norms,
const scalar_t* __restrict__ v,
const scalar_t* __restrict__ g,
const int rowSize)
{
// We are norming each slowest-dim row of the tensor separately.
// For now, assign one block to each row.
const int tid = threadIdx.x;
const int row = blockIdx.x;
const int stride = blockDim.x;
// Logical index offset for this flattened row
const int rowStart = row*rowSize;
// Hack to get around nvcc complaining when an smem array is declared with the same name
// but different types in different kernels (in this case different instantiations)
// extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s"
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
accscalar_t result = s[0];
result = sqrtf(result);
if(tid == 0)
norms[row] = result;
// Broadcast load, could use shared memory instead.
accscalar_t g_this_row = scalar_cast<accscalar_t>(g[row]);
accscalar_t rnorm = 1.f/result; // for consistency with backward kernel
// Write data to output
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
w[i+rowStart] = scalar_cast<scalar_t>(g_this_row*val_f*rnorm);
}
}
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_fwd_last_dim_kernel
(
scalar_t* __restrict__ w,
accscalar_t* __restrict__ norms,
const scalar_t* __restrict__ v,
const scalar_t* __restrict__ g,
const int fast_dim_size,
const int slower_dims_size
)
{
const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ char buf[];
accscalar_t* alloc = (accscalar_t*)buf;
accscalar_t* s = &alloc[0];
accscalar_t* rnorms_this_block = &alloc[blockDim.x*blockDim.y];
accscalar_t thread_sum = 0.f;
int slower_dims_location = threadIdx.y;
int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0)
{
accscalar_t result = s[threadIdx.x];
accscalar_t norm_this_col = sqrtf(result);
norms[fast_dim_location] = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
}
__syncthreads();
accscalar_t g_this_col = scalar_cast<accscalar_t>(g[fast_dim_location]);
accscalar_t rnorm = rnorms_this_block[threadIdx.x];
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
w[currentIdx] = scalar_cast<scalar_t>(g_this_col*val_f*rnorm);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
void weight_norm_fwd_cuda
(const at::Tensor& w,
const at::Tensor& norms,
const at::Tensor& v,
const at::Tensor& g,
int dim)
{
#ifdef DEBUG_ANY
using namespace std;
cout << "hello from send_to_fwd with v.type = " << v.type << endl;
#endif
const int ndims = v.ndimension();
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
int rowSize = 1;
for(int i = ndims - 1; i > 0; i--)
rowSize *= v.size(i);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(v.type(),
"weight_norm_fwd_first_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
weight_norm_fwd_first_dim_kernel
<<<v.size(0),
BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
(w.data<cuda_scalar_t>(),
norms.data<accscalar_t>(),
v.data<cuda_scalar_t>(),
g.data<cuda_scalar_t>(),
rowSize);
});
}
else if(dim == ndims - 1)
{
// Precompute slower_dims_size and fast_dim_size
int slower_dims_size = 1;
for(int i = 0; i < ndims - 1; i++)
slower_dims_size *= v.size(i);
int fast_dim_size = v.size(ndims-1);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(v.type(),
"weight_norm_fwd_last_dim_kernel",
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
// just trying this formatting out to see how it feels...
weight_norm_fwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t),
stream>>>
(w.data<cuda_scalar_t>(),
norms.data<accscalar_t>(),
v.data<cuda_scalar_t>(),
g.data<cuda_scalar_t>(),
fast_dim_size,
slower_dims_size);
});
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_fwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
// The kernel execution is asynchronous, so this will only catch errors on the kernel launch,
// not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught
// until a later error check on a synchronizing CUDA call. Unfortunately, without manually
// synchronizing here, this is the best we can do.
THCudaCheck(cudaGetLastError());
#ifdef DEBUG_PROFILE
THCudaCheck(cudaDeviceSynchronize());
#endif
}
#ifndef THC_TENSOR_INFO_INC
#define THC_TENSOR_INFO_INC
#include <cuda.h>
#include <cuda_fp16.h>
#include <assert.h>
// Maximum number of dimensions allowed for cutorch
#define MAX_CUTORCH_DIMS 10
// Warning string for tensor arguments that are too large or have too many dimensions
#define CUTORCH_STR(X) #X
#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
enum float_types { FLOAT = 0 , HALF = 1, DOUBLE = 2 };
// CUDA kernel argument that defines tensor layout
template <typename T, typename IndexType>
struct TensorInfo {
TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS]);
TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS],
float_types type);
//Good way to cast from another format
//template <TensorInfo<typename T2, typename I2> >
//TensorInfo(TensorInfo<T2, I2> &tinfo_in){
// data = reinterpret_cast<T*>(tinfo_in.data);
//}
T* data;
IndexType sizes[MAX_CUTORCH_DIMS];
IndexType strides[MAX_CUTORCH_DIMS];
int dims;
float_types type;
};
//Expand our combinations as convenient typedefs
typedef TensorInfo<half, int> t_hi;
typedef TensorInfo<half, long> t_hl;
typedef TensorInfo<float, int> t_fi;
typedef TensorInfo<float, long> t_fl;
template <typename T, typename IndexType>
TensorInfo<T, IndexType>::TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS]) {
data = p;
dims = dim;
assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
for (int i = 0; i < dim; ++i) {
sizes[i] = sz[i];
strides[i] = st[i];
}
}
template<typename T, typename IndexType>
TensorInfo<T, IndexType>::TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS],
float_types _type){
data = p;
dims = dim;
assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
for (int i = 0; i < dim; ++i) {
sizes[i] = sz[i];
strides[i] = st[i];
}
type=_type;
}
// Translate a linear index for the apply to a T* offset;
// specialized on `Dims` to reduce nvcc compilation time
template <typename T, typename IndexType, int Dims>
struct IndexToOffset {
static __forceinline__ __host__ __device__ IndexType get(
IndexType linearId,
const TensorInfo<T, IndexType>& info) {
IndexType offset = 0;
// Use static dims
for (int i = Dims - 1; i > 0; --i) {
offset += linearId % info.sizes[i] * info.strides[i];
linearId /= info.sizes[i];
}
offset += linearId * info.strides[0];
return offset;
}
};
// For contiguous tensors, the offset = index
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -2> {
static __forceinline__ __host__ __device__ IndexType
get(IndexType linearId, const TensorInfo<T, IndexType>& info) {
return linearId;
}
};
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -1> {
static __forceinline__ __host__ __device__ IndexType get(
IndexType linearId,
const TensorInfo<T, IndexType>& info) {
IndexType offset = 0;
// Use dynamic dims
for (int i = info.dims - 1; i >= 0; --i) {
IndexType curDimIndex = linearId % info.sizes[i];
IndexType curDimOffset = curDimIndex * info.strides[i];
offset += curDimOffset;
linearId /= info.sizes[i];
}
return offset;
}
};
#endif // THC_TENSOR_INFO_INC
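To illustrate the dynamic-dims path of IndexToOffset above (an added example, not part of the original header): for a non-contiguous view with sizes = {3, 2} and strides = {1, 3} (the transpose of a contiguous 2x3 tensor), linearId = 4 corresponds to logical coordinates (2, 0); the loop computes 4 % 2 = 0 (offset += 0*3), linearId becomes 2, then 2 % 3 = 2 (offset += 2*1), giving offset 2, which is where element (2, 0) of the transposed view lives in storage. For a contiguous tensor the same loop reproduces linearId itself, which is why the Dims == -2 specialization simply returns linearId.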
#include "THCTensorInfo.cuh"
#include <iostream>
#include <cstdio>
#include <cassert>
#include <cuda.h>
#include <vector>
#define Dims -2
#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<T, IndexType, Dims>::get(INDEX, D_TENSOR)]
#define DEVICE_LINEAR_GET_F(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<float, IndexType, Dims>::get(INDEX, D_TENSOR)]
typedef int idxType;
struct send_to_fwd_wrapper
{
template<typename DataType,
typename AccumType,
typename IndexType>
static void call(std::vector<TensorInfo<void, idxType>>& tensors, int dim);
};
struct send_to_bwd_wrapper
{
template<typename DataType,
typename AccumType,
typename IndexType>
static void call(std::vector<TensorInfo<void, idxType>>& tensors, int dim);
};
template <typename In, typename Out>
struct ScalarConvert {
static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
};
#ifdef CUDA_HALF_TENSOR
template <typename Out>
struct ScalarConvert<half, Out> {
static __host__ __device__ __forceinline__ Out to(const half v) {
#ifdef __CUDA_ARCH__
return (Out) __half2float(v);
#else
return (Out) THC_half2float(v);
#endif
}
};
template <typename In>
struct ScalarConvert<In, half> {
static __host__ __device__ __forceinline__ half to(const In v) {
#ifdef __CUDA_ARCH__
return __float2half((float) v);
#else
return THC_float2half((float) v);
#endif
}
};
template <>
struct ScalarConvert<half, half> {
static __host__ __device__ __forceinline__ half to(const half v) {
return v;
}
};
#endif
import re
import sys
import os
import shutil
import inspect
import distutils
import distutils.spawn
import torch.cuda
from setuptools import setup, find_packages
from distutils.command.clean import clean
from torch.utils.cpp_extension import CppExtension, CUDAExtension
from torch.utils.cpp_extension import CUDA_HOME
# TODO: multiple modules, so we don't have to route all interfaces through
# the same interface.cpp file?
if torch.cuda.is_available() and CUDA_HOME is not None:
    ext_modules = []
    extension = CUDAExtension(
        'apex._C', [
            'csrc/interface.cpp',
            'csrc/weight_norm_fwd_cuda.cu',
            'csrc/weight_norm_bwd_cuda.cu',
            'csrc/scale_cuda.cu',
        ],
        extra_compile_args={'cxx': ['-g'],
                            'nvcc': ['-O2', '-arch=sm_70']})  # TODO: compile for all arches.
    ext_modules.append(extension)
else:
    raise RuntimeError("Apex requires Cuda 9.0 or higher")
from setuptools import setup, Extension, find_packages
from setuptools.command.install import install
import subprocess
import ctypes.util
import torch
def find(path, regex_func, collect=False):
    """
    Recursively searches through a directory with regex_func and
    either collects all instances or returns the first instance.

    Args:
        path: Directory to search through
        regex_function: A function to run on each file to decide if it should be returned/collected
        collect (False) : If True will collect all instances of matching, else will return first instance only
    """
    collection = [] if collect else None
    for root, dirs, files in os.walk(path):
        for file in files:
            if regex_func(file):
                if collect:
                    collection.append(os.path.join(root, file))
                else:
                    return os.path.join(root, file)
    return list(set(collection))
def findcuda():
"""
Based on PyTorch build process. Will look for nvcc for compilation.
Either will set cuda home by environment variable CUDA_HOME or will search
for nvcc. Returns NVCC executable, cuda major version and cuda home directory.
"""
cuda_path = None
CUDA_HOME = None
CUDA_HOME = os.getenv('CUDA_HOME', '/usr/local/cuda')
if not os.path.exists(CUDA_HOME):
# We use nvcc path on Linux and cudart path on macOS
cudart_path = ctypes.util.find_library('cudart')
if cudart_path is not None:
cuda_path = os.path.dirname(cudart_path)
if cuda_path is not None:
CUDA_HOME = os.path.dirname(cuda_path)
if not cuda_path and not CUDA_HOME:
nvcc_path = find('/usr/local/', re.compile("nvcc").search, False)
if nvcc_path:
CUDA_HOME = os.path.dirname(nvcc_path)
if CUDA_HOME:
os.path.dirname(CUDA_HOME)
if (not os.path.exists(CUDA_HOME+os.sep+"lib64")
or not os.path.exists(CUDA_HOME+os.sep+"include") ):
raise RuntimeError("Error: found NVCC at ", nvcc_path ," but could not locate CUDA libraries"+
" or include directories.")
raise RuntimeError("Error: Could not find cuda on this system."+
" Please set your CUDA_HOME enviornment variable to the CUDA base directory.")
NVCC = find(CUDA_HOME+os.sep+"bin",
re.compile('nvcc$').search)
print("Found NVCC = ", NVCC)
# Parse output of nvcc to get cuda major version
nvcc_output = subprocess.check_output([NVCC, '--version']).decode("utf-8")
CUDA_LIB = re.compile(', V[0-9]+\.[0-9]+\.[0-9]+').search(nvcc_output).group(0).split('V')[1]
print("Found CUDA_LIB = ", CUDA_LIB)
if CUDA_LIB:
try:
CUDA_VERSION = int(CUDA_LIB.split('.')[0])
except (ValueError, TypeError):
CUDA_VERSION = 9
else:
CUDA_VERSION = 9
if CUDA_VERSION < 8:
raise RuntimeError("Error: APEx requires CUDA 8 or newer")
return NVCC, CUDA_VERSION, CUDA_HOME
#Get some important paths
curdir = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
buildir = curdir+os.sep+"build"
if not os.path.exists(buildir):
os.makedirs(buildir)
torch_dir = os.path.split(torch.__file__)[0] + os.sep + "lib"
cuda_files = find(curdir, lambda file: file.endswith(".cu"), True)
cuda_headers = find(curdir, lambda file: file.endswith(".cuh"), True)
headers = find(curdir, lambda file: file.endswith(".h"), True)
libaten = list(set(find(torch_dir, re.compile("libaten", re.IGNORECASE).search, True)))
libaten_names = [os.path.splitext(os.path.basename(entry))[0] for entry in libaten]
for i, entry in enumerate(libaten_names):
if entry[:3]=='lib':
libaten_names[i] = entry[3:]
aten_h = find(torch_dir, re.compile("aten.h", re.IGNORECASE).search, False)
torch_inc = os.path.dirname(os.path.dirname(aten_h))
include_dirs = [torch_inc]
library_dirs = []
for file in cuda_headers+headers:
dir = os.path.dirname(file)
if dir not in include_dirs:
include_dirs.append(dir)
# Object files that use the PyTorch cffi-extension interface
# They need special handling during compilation
cffi_objects = ['scale_kernel.o']
assert libaten, "Could not find PyTorch's libATen."
assert aten_h, "Could not find PyTorch's ATen header."
library_dirs.append(os.path.dirname(libaten[0]))
#create some places to collect important things
object_files = []
extra_link_args=[]
main_libraries = []
main_libraries += ['cudart',]+libaten_names
extra_compile_args = ["--std=c++11",]
#findcuda returns root dir of CUDA
#include cuda/include and cuda/lib64 for python module build.
NVCC, CUDA_VERSION, CUDA_HOME=findcuda()
library_dirs.append(os.path.join(CUDA_HOME, "lib64"))
include_dirs.append(os.path.join(CUDA_HOME, 'include'))
class RMBuild(clean):
def run(self):
#BE VERY CAUTIOUS WHEN USING RMTREE!!!
#These are some carefully written/crafted directories
if os.path.exists(buildir):
shutil.rmtree(buildir)
distdir = curdir+os.sep+"dist"
if os.path.exists(distdir):
shutil.rmtree(distdir)
eggdir = curdir+os.sep+"apex.egg-info"
if os.path.exists(eggdir):
shutil.rmtree(eggdir)
clean.run(self)
def CompileCudaFiles(NVCC, CUDA_VERSION):
print()
print("Compiling cuda modules with nvcc:")
gencodes = ['-gencode', 'arch=compute_52,code=sm_52',
'-gencode', 'arch=compute_60,code=sm_60',
'-gencode', 'arch=compute_61,code=sm_61',]
if CUDA_VERSION > 8:
gencodes += ['-gencode', 'arch=compute_70,code=sm_70',
'-gencode', 'arch=compute_70,code=compute_70',]
#Need arches to compile for. Compiles for 70 which requires CUDA9
nvcc_cmd = [NVCC,
'-Xcompiler',
'-fPIC'
] + gencodes + [
'--std=c++11',
'-O3',
]
for dir in include_dirs:
nvcc_cmd.append("-I"+dir)
# Hack: compiling the cffi kernel code needs the TH{C}
# subdirs of include on path as well
for suffix in ['TH', 'THC']:
nvcc_cmd.append('-I{}/{}'.format(torch_inc, suffix))
for file in cuda_files:
object_name = os.path.basename(
os.path.splitext(file)[0]+".o"
)
object_file = os.path.join(buildir, object_name)
object_files.append(object_file)
file_opts = ['-c', file, '-o', object_file]
extra_args = []
if object_name in cffi_objects:
for module in ['TH', 'THC']:
extra_args.append('-I{}/{}'.format(torch_inc, module))
build_args = nvcc_cmd + extra_args + file_opts
print(' '.join(build_args))
subprocess.check_call(build_args)
for object_file in object_files:
extra_link_args.append(object_file)
if 'clean' not in sys.argv:
print()
print("Arguments used to build CUDA extension:")
print("extra_compile_args :", extra_compile_args)
print("include_dirs: ", include_dirs)
print("extra_link_args: ", extra_link_args)
print("library_dirs: ", library_dirs)
print("libraries: ", main_libraries)
print()
CompileCudaFiles(NVCC, CUDA_VERSION)
print("Building CUDA extension.")
cuda_ext = Extension('apex._C',
[os.path.join('csrc', 'Module.cpp')],
extra_compile_args = extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args,
library_dirs=library_dirs,
runtime_library_dirs = library_dirs,
libraries=main_libraries
)
if 'clean' not in sys.argv:
print("Building module.")
setup(
name='apex', version='0.1',
cmdclass={
'clean' : RMBuild,
},
ext_modules=[cuda_ext,],
name='apex',
version='0.1',
packages=find_packages(exclude=('build',
'csrc',
'include',
'tests',
'dist',
'docs',
'tests',
'examples',
'apex.egg-info',)),
ext_modules=ext_modules,
description='PyTorch Extensions written by NVIDIA',
packages=find_packages(exclude=("build", "csrc", "include", "tests")),
# Require cffi
install_requires=["cffi>=1.0.0"],
setup_requires=["cffi>=1.0.0"],
cffi_modules=[os.path.join(os.path.dirname(__file__),
'build_cffi.py:extension')],
cmdclass={'build_ext': torch.utils.cpp_extension.BuildExtension},
)