Commit 2fa4dbaf authored by Christian Sarofeen

Initial release
#define PY_SSIZE_T_CLEAN
#define ARG_OFFSET 5
#include <Python.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <string>
#include <cmath>
#include <cassert>
#include <iostream>
// #define USE_NVTX
#ifdef USE_NVTX
#include "nvToolsExt.h"
#endif
//Meta-data format we will use
#include <THCTensorInfo.cuh>
//Cuda kernels
#include <kernel.h>
#define ERROR_MSG cout << "Error at " << __FILE__ << ":" << __LINE__ << "\n";
using namespace std;
TensorInfo<void, idxType> PyOb_2_tinfo(PyObject* tensor, float_types data_type)
{
PyObject* PyStrides = PyObject_CallMethod(tensor, "stride", NULL);
if(PyStrides == NULL)
{
ERROR_MSG;
cout << "PyStrides = NULL" << endl;
}
PyObject* PySizes = PyObject_CallMethod(tensor, "size", NULL);
if(PySizes == NULL)
{
ERROR_MSG;
cout << "PySizes = NULL" << endl;
}
PyObject* PyDataPtr = PyObject_CallMethod(tensor, "data_ptr", NULL);
if(PyDataPtr == NULL)
{
ERROR_MSG;
cout << "PyDataPtr = NULL" << endl;
}
void* data_ptr = (void*) PyLong_AsLong(PyDataPtr);
Py_ssize_t ndims = PyTuple_GET_SIZE(PySizes); // tensor.size() returns a torch.Size, which is a tuple
//TODO put proper checking on ndims < MAX_CUTORCH_DIMS
idxType strides[MAX_CUTORCH_DIMS], sizes[MAX_CUTORCH_DIMS];
for(int i = 0; i < ndims; i++)
{
strides[i] = PyLong_AsLong(PyTuple_GetItem(PyStrides, i));
sizes[i] = PyLong_AsLong(PyTuple_GetItem(PySizes, i));
}
// Reference counts still behave strangely, but at least these appear to cap
// the process' memory usage.
Py_DECREF(PyStrides);
Py_DECREF(PySizes);
Py_DECREF(PyDataPtr);
return TensorInfo<void, idxType>(data_ptr, ndims, sizes, strides, data_type);
}
vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
{
vector<TensorInfo<void, idxType> > info_vec;
#ifdef DEBUG_ANY
cout << "Processing " << PyTuple_GET_SIZE(args) << " arguments" << endl;
#endif
#ifdef CHECK_MEMLEAK
for(int iter = 0; iter < 1e7; iter++ )
#endif
for(Py_ssize_t i = 0; i<PyTuple_GET_SIZE(args) - 1; i++)
{
PyObject* pyTensor = PyTuple_GetItem(args, i);
// check type, only take if Tensor, Variable, or Parameter
string objType(pyTensor->ob_type->tp_name);
PyObject* pyObjTypeCall = PyObject_CallMethod(pyTensor, "type", NULL);
if(pyObjTypeCall == NULL)
{
ERROR_MSG;
cout << "For args item " << i << ", pyObjTypeCall = NULL" << endl;
}
// This gives a segfault:
// cout << "pyObjTypeCall direct conversion attempt = " <<
// PyBytes_AsString(pyObjTypeCall) << endl;
PyObject* pyObjASCII = PyUnicode_AsASCIIString(pyObjTypeCall);
if(pyObjASCII == NULL)
{
ERROR_MSG;
cout << "For args item " << i << ", pyObjASCII = NULL " << endl;
}
// cout << "Py_REFCNT(pyObjTypeCall) = " << Py_REFCNT(pyObjTypeCall) << endl;
Py_DECREF(pyObjTypeCall);
string objTypeCall(PyBytes_AsString(pyObjASCII));
// cout << "Py_REFCNT(pyObjASCII) = " << Py_REFCNT(pyObjASCII) << endl;
Py_DECREF(pyObjASCII);
#ifdef DEBUG_ANY
cout << "arg " << i << endl;
cout << "objType = " << objType << endl;
cout << "objTypeCall = " << objTypeCall << endl;
#endif
if(objTypeCall == "torch.cuda.FloatTensor")
#ifdef CHECK_MEMLEAK
if(iter == 0 )
#endif
info_vec.push_back(PyOb_2_tinfo(pyTensor, FLOAT));
#ifdef CHECK_MEMLEAK
else
info_vec[i] = PyOb_2_tinfo(pyTensor, FLOAT);
#endif
else if(objTypeCall == "torch.cuda.HalfTensor")
info_vec.push_back(PyOb_2_tinfo(pyTensor, HALF));
// Could add double
else
{
ERROR_MSG;
cout << "For args item " << i << ", unsupported .type() found: "
<< objTypeCall << "\n"
"Supported types:\n"
"torch.cuda.FloatTensor\n"
"torch.cuda.HalfTensor\n"
"torch.autograd.variable.Variable containing FloatTensor\n"
"torch.autograd.variable.Variable containing HalfTensor\n"
"torch.nn.parameter.Parameter containing FloatTensor\n"
"torch.nn.parameter.Parameter containing HalfTensor\n"
<< endl;
}
}
// PyErr_SetString(PyExc_RuntimeError, "Exception set in ");
return info_vec;
}
int getLastArg_AsInt(PyObject* args)
{
// None of these should return new references so I don't think this leaks memory.
int dims = PyLong_AsLong(PyTuple_GetItem(args, PyTuple_GET_SIZE(args) - 1));
return dims;
}
// Stepping stone, can evolve to be more general (argument forwarding?)
template<typename wrapper>
void dispatch
(
float_types rtti,
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
switch(rtti)
{
case FLOAT:
wrapper::template call<float, float, idxType>(tensors, dim);
break;
case HALF:
wrapper::template call<half, float, idxType>(tensors, dim);
break;
default:
std::cout << "Unsupported rtti in Module.cpp:dispatch()" << std::endl;
PyErr_SetString(PyExc_RuntimeError, "Unsupported data type in Module.cpp:dispatch, "
"supported data types are half and float");
exit(-1);
}
}
//Will extract all tensors in order. Assumes a flat structure: tensors cannot be wrapped in
//lists, tuples, or any other iterable structure.
static PyObject* weight_norm_fwd(PyObject* self, PyObject* args)
{
#ifdef USE_NVTX
nvtxRangePushA("weight_norm_fwd C backend");
#endif
vector<TensorInfo<void, idxType> > tensors = get_TInfos(args);
int dim = getLastArg_AsInt(args);
if(dim != 0 && dim != tensors[2].dims - 1)
{
// Return NULL after setting the exception so Python raises it.
PyErr_SetString(PyExc_RuntimeError, "weight_norm_fwd currently only "
"supports first or last dimension.");
return NULL;
}
else
{
#ifdef DEBUG_ANY
cout << "tensors.size() = " << tensors.size() << ", dim = " << dim << endl;
#endif
dispatch<send_to_fwd_wrapper>(tensors[0].type, tensors, dim);
#ifdef USE_NVTX
nvtxRangePop();
#endif
}
Py_RETURN_NONE;
}
static PyObject* weight_norm_bwd(PyObject* self, PyObject* args)
{
#ifdef USE_NVTX
nvtxRangePushA("weight_norm_bwd C backend");
#endif
vector<TensorInfo<void, idxType> >tensors = get_TInfos(args);
int dim = getLastArg_AsInt(args);
if(dim != 0 && dim != tensors[3].dims - 1)
{
// Return NULL after setting the exception so Python raises it.
PyErr_SetString(PyExc_RuntimeError, "weight_norm_bwd currently only "
"supports first or last dimension.");
return NULL;
}
else
{
#ifdef DEBUG_ANY
cout << "tensors.size() = " << tensors.size() << ", dim = " << dim << endl;
#endif
dispatch<send_to_bwd_wrapper>(tensors[0].type, tensors, dim);
#ifdef USE_NVTX
nvtxRangePop();
#endif
}
Py_RETURN_NONE;
}
//*******************PYTHON BOILER PLATE*******************
static PyMethodDef apex_methods[] = {
{"weight_norm_fwd", (PyCFunction) weight_norm_fwd, METH_VARARGS, "Slowest-dim norm, forward pass."},
{"weight_norm_bwd", (PyCFunction) weight_norm_bwd, METH_VARARGS, "Slowest-dim norm, backward pass."},
{NULL, NULL, 0, NULL}
};
#if PY_MAJOR_VERSION >= 3
//Module Definitions
static struct PyModuleDef apex = {
PyModuleDef_HEAD_INIT, "apex._C", "Module to add CUDA extensions to Pytorch.", -1, apex_methods
};
//Initialization Function
PyMODINIT_FUNC PyInit__C(void){
//Let's throw an error if we can't find pytorch.
PyImport_ImportModule("torch");
Py_Initialize();
return PyModule_Create(&apex);
}
#else
PyMODINIT_FUNC init_C(void){
//Let's throw an error if we can't find pytorch.
PyImport_ImportModule("torch");
(void) Py_InitModule3("apex._C", apex_methods, "A PyTorch Extension.");
}
#endif
//*********************************************************
#include "../include/kernel.h"
template<typename T> struct TtoInt { static const int test = -1; };
template<> struct TtoInt<float> { static const int test = 0; };
template<> struct TtoInt<half> { static const int test = 0; };
template<> struct TtoInt<double> { static const int test = 0; };
#if __CUDACC_VER_MAJOR__ >= 9
#define __SHFL_DOWN(var, delta) __shfl_down_sync(0xffffffff, var, delta)
#else
#define __SHFL_DOWN(var, delta) __shfl_down(var, delta)
#endif
#if __CUDACC_VER_MAJOR__ >= 9
#define __SYNCWARP __syncwarp()
#else
#define __SYNCWARP
#endif
// Block size for weight_norm_*_first_dim_kernel.
// Currently, kernels are non-persistent.
// Dialing up the block size to, say, 1024 can improve performance by
// increasing the amount of cache available per block, which can improve the cache hit rate.
// However, this is less efficient for short rows. 256 is pretty versatile.
// Implement some heuristics later?
#define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel.
// This is trickier than the first_dim case because we must make blocks
// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses.
// Since output-element parallelism is along the fast dimension, this reduces the number of
// blocks we can launch by 16X.
#define TILE_W 16
// Somewhat versatile strategy: max out intra-block parallelism by extending
// blocks across the slow dimension up to the hardware-max block size of 1024.
#define TILE_H 64
using namespace std;
// lanes is intended to be <= 32.
template <typename T>
__device__ __forceinline__ void reduce_block_into_lanes(T *x, T val, int lanes)
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y;
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] += x[tid+i]; // JoinOp
__syncthreads();
}
if(tid < 32)
{
T final;
if(blockSize >= 64)
final = x[tid] + x[tid+32]; // JoinOp
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final += __SHFL_DOWN(final, i);
if(tid < lanes)
x[tid] = final; // EpilogueOp
}
// Make sure the smem result is visible to all warps.
__syncthreads();
}
template <typename T, typename IndexType>
__global__ void weight_norm_fwd_first_dim_kernel
(
TensorInfo<T, IndexType> w,
TensorInfo<float, IndexType> norms,
TensorInfo<T, IndexType> v,
TensorInfo<T, IndexType> g,
IndexType rowSize
)
{
// We are norming each slowest-dim row of the tensor separately.
// For now, assign one block to each row.
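// Math per row r (with n_r = ||v_r||_2, computed below): norms[r] = n_r and w_r = g_r * v_r / n_r.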
IndexType tid = threadIdx.x;
IndexType row = blockIdx.x;
IndexType stride = blockDim.x;
// Logical index offset for this flattened row
IndexType rowStart = row*rowSize;
extern __shared__ float s[];
float thread_sum = 0.f;
for(IndexType i = tid; i < rowSize; i += stride )
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, i + rowStart));
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1);
float result = s[0];
result = sqrtf(result);
if(tid == 0)
DEVICE_LINEAR_GET_F(norms, row) = result;
// Broadcast load, could use shared memory instead.
float g_this_row = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(g, row));
float rnorm = 1.f/result; // for consistency with backward kernel
// Write data to output
for(IndexType i = tid; i < rowSize; i += stride )
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, i + rowStart));
DEVICE_LINEAR_GET(w, i + rowStart) = ScalarConvert<float,T>::to(g_this_row*val_f*rnorm);
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_fwd_last_dim_kernel
(
TensorInfo<T, IndexType> w,
TensorInfo<float, IndexType> norms,
TensorInfo<T, IndexType> v,
TensorInfo<T, IndexType> g,
IndexType fast_dim_size,
IndexType slower_dims_size
)
{
IndexType fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ float alloc[];
float* s = &alloc[0];
float* rnorms_this_block = &alloc[blockDim.x*blockDim.y];
float thread_sum = 0.f;
IndexType slower_dims_location = threadIdx.y;
IndexType currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, currentIdx));
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x);
// Better to pass an EpilogueOp to reduce_block_into_lanes, can try later
if(threadIdx.y == 0)
{
float result = s[threadIdx.x];
float norm_this_col = sqrtf(result);
DEVICE_LINEAR_GET_F(norms, fast_dim_location) = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
// printf("blockIdx.x = %d, threadIdx.x = %d, norm_this_col = %f\n",
// blockIdx.x, threadIdx.x, norm_this_col);
}
__syncthreads();
float g_this_col = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(g, fast_dim_location));
float rnorm = rnorms_this_block[threadIdx.x];
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(v, currentIdx));
DEVICE_LINEAR_GET(w, currentIdx) = ScalarConvert<float,T>::to(g_this_col*val_f*rnorm);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_bwd_first_dim_kernel
(
TensorInfo<T, IndexType> pLpv,
TensorInfo<T, IndexType> pLpg,
TensorInfo<T, IndexType> pLpw,
TensorInfo<T, IndexType> savedv,
TensorInfo<T, IndexType> savedg,
TensorInfo<float, IndexType> savedNorms,
IndexType rowSize
)
{
// For now, assign one block to each row.
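// Math per row r, with n_r = savedNorms[r] = ||v_r||_2 and s_r = (pLpw_r . v_r), computed below:
//   pLpg_r = s_r / n_r
//   pLpv_r = g_r * (pLpw_r / n_r - v_r * s_r / n_r^3)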
IndexType tid = threadIdx.x;
IndexType row = blockIdx.x;
IndexType stride = blockDim.x;
// Logical index offset for this flattened row
IndexType rowStart = row*rowSize;
extern __shared__ float s[];
float thread_sum = 0.f;
for(IndexType i = tid; i < rowSize; i += stride )
{
float pLpwi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, i + rowStart));
float savedvi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, i + rowStart));
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1);
float result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
// more handy to keep around.
// Broadcast load; could use shared memory instead.
float rnorm = 1.f/DEVICE_LINEAR_GET_F(savedNorms, row);
float rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(tid == 0)
DEVICE_LINEAR_GET(pLpg, row) = ScalarConvert<float, T>::to(result*rnorm);
// Broadcast load, could use shared memory instead.
float g_this_row = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedg, row));
// Write v gradients. We are reusing values that were loaded earlier, so there
// is an optimization opportunity here (store values persistently).
for(IndexType j = tid; j < rowSize; j += stride )
{
float pLpwj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, j + rowStart));
float savedvj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, j + rowStart));
float pLpvj = g_this_row*(rnorm*pLpwj - rnorm3*savedvj*result);
DEVICE_LINEAR_GET(pLpv, j + rowStart) = ScalarConvert<float,T>::to(pLpvj);
}
}
template <typename T, typename IndexType>
__global__ void weight_norm_bwd_last_dim_kernel
(
TensorInfo<T, IndexType> pLpv,
TensorInfo<T, IndexType> pLpg,
TensorInfo<T, IndexType> pLpw,
TensorInfo<T, IndexType> savedv,
TensorInfo<T, IndexType> savedg,
TensorInfo<float, IndexType> savedNorms,
IndexType fast_dim_size,
IndexType slower_dims_size
)
{
IndexType fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ float s[];
float thread_sum = 0.f;
IndexType slower_dims_location = threadIdx.y;
IndexType currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float pLpwi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, currentIdx));
float savedvi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, currentIdx));
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x);
float result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
float rnorm = 1.f/DEVICE_LINEAR_GET_F(savedNorms, fast_dim_location);
float rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(threadIdx.y == 0)
DEVICE_LINEAR_GET(pLpg, fast_dim_location) = ScalarConvert<float, T>::to(result*rnorm);
// Entire block pulls these values, could use shared memory instead.
float g_this_col = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedg, fast_dim_location));
// Write v gradients.
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
float pLpwj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpw, currentIdx));
float savedvj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedv, currentIdx));
float pLpvj = g_this_col*(rnorm*pLpwj - rnorm3*savedvj*result);
DEVICE_LINEAR_GET(pLpv, currentIdx) = ScalarConvert<float,T>::to(pLpvj);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
template<typename DataType,
typename AccumType,
typename IndexType>
void send_to_fwd_wrapper::call
(
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
#ifdef DEBUG_ANY
cout << "hello from send_to_fwd with v.type = " << v.type << endl;
#endif
auto w (*((TensorInfo<DataType , idxType>*)&tensors[0]));
auto norms(*((TensorInfo<AccumType, idxType>*)&tensors[1]));
auto v (*((TensorInfo<DataType , idxType>*)&tensors[2]));
auto g (*((TensorInfo<DataType , idxType>*)&tensors[3]));
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
IndexType rowSize = 1;
for(IndexType i = v.dims - 1; i > 0; i--)
rowSize *= v.sizes[i];
weight_norm_fwd_first_dim_kernel<<<v.sizes[0], BLOCK, BLOCK*sizeof(float)>>>
(
w,
norms,
v,
g,
rowSize
);
}
else if(dim == v.dims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
IndexType slower_dims_size = 1;
for(IndexType i = 0; i < v.dims - 1; i++)
slower_dims_size *= v.sizes[i];
int fast_dim_size = v.sizes[v.dims-1];
weight_norm_fwd_last_dim_kernel<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(float)>>>
(
w,
norms,
v,
g,
fast_dim_size,
slower_dims_size
);
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_fwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
#ifdef DEBUG_PROFILE
cudaDeviceSynchronize();
#endif
}
// template <typename T, typename IndexType>
template<typename DataType,
typename AccumType,
typename IndexType>
void send_to_bwd_wrapper::call
(
vector<TensorInfo<void, idxType>>& tensors,
int dim
)
{
#ifdef DEBUG_ANY
cout << "Hello from send_to_bwd with pLpw.type = " << pLpw.type << endl;
#endif
// this feels sinful
auto pLpv (*((TensorInfo<DataType , idxType>*)&tensors[0]));
auto pLpg (*((TensorInfo<DataType , idxType>*)&tensors[1]));
auto pLpw (*((TensorInfo<DataType , idxType>*)&tensors[2]));
auto savedv (*((TensorInfo<DataType , idxType>*)&tensors[3]));
auto savedg (*((TensorInfo<DataType , idxType>*)&tensors[4]));
auto savedNorms(*((TensorInfo<AccumType, idxType>*)&tensors[5]));
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
IndexType rowSize = 1;
for(IndexType i = savedv.dims - 1; i > 0; i--)
rowSize *= savedv.sizes[i];
weight_norm_bwd_first_dim_kernel<<<pLpw.sizes[0], BLOCK, BLOCK*sizeof(float)>>>
(
pLpv,
pLpg,
pLpw,
savedv,
savedg,
savedNorms,
rowSize
);
}
else if(dim == savedv.dims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
IndexType slower_dims_size = 1;
for(IndexType i = 0; i < savedv.dims - 1; i++)
slower_dims_size *= savedv.sizes[i];
int fast_dim_size = savedv.sizes[savedv.dims-1];
weight_norm_bwd_last_dim_kernel<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(float)>>>
(
pLpv,
pLpg,
pLpw,
savedv,
savedg,
savedNorms,
fast_dim_size,
slower_dims_size
);
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_bwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
#ifdef DEBUG_PROFILE
cudaDeviceSynchronize();
#endif
}
#define INSTANTIATE_SEND_TO_FWD(DATATYPE, ACCUMTYPE, IDXTYPE) \
template void send_to_fwd_wrapper::call<DATATYPE, ACCUMTYPE, IDXTYPE> \
( \
vector<TensorInfo<void, idxType>>&, \
int \
);
INSTANTIATE_SEND_TO_FWD(float, float, idxType)
INSTANTIATE_SEND_TO_FWD(half, float, idxType)
#undef INSTANTIATE_SEND_TO_FWD
#define INSTANTIATE_SEND_TO_BWD(DATATYPE, ACCUMTYPE, IDXTYPE) \
template void send_to_bwd_wrapper::call<DATATYPE, ACCUMTYPE, IDXTYPE> \
( \
vector<TensorInfo<void, idxType>>&, \
int \
);
INSTANTIATE_SEND_TO_BWD(float, float, idxType)
INSTANTIATE_SEND_TO_BWD(half, float, idxType)
#undef INSTANTIATE_SEND_TO_BWD
#undef BLOCK
#undef TILE_W
#undef TILE_H
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = PyTorch
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
docset: html
doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/nv-pytorch2.png --enable-js --online-redirect-url http://pytorch.org/docs/ --force $(BUILDDIR)/html/
# Manually fix because Zeal doesn't deal well with `icon.png`-only at 2x resolution.
cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png
convert $(SPHINXPROJ).docset/icon@2x.png -resize 16x16 $(SPHINXPROJ).docset/icon.png
gh-pages:
git checkout gh-pages
rm -rf build
rm -rf source
git checkout master -- .
make html
rm -rf ../_modules ../_sources ../_static
mv -fv build/html/* ../
rm -rf build
git add -A
git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master
.PHONY: help Makefile docset
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.. role:: hidden
:class: hidden-section
apex.RNN
===================================
This submodule is an in-development API aimed at supplying parity with torch.nn.RNN
while being easier to extend. This module is not ready for use and still lacks important
features and validation.
.. automodule:: apex.RNN
.. currentmodule:: apex.RNN
.. RNN
----------
.. autofunction:: LSTM
.. autofunction:: mLSTM
.. autofunction:: GRU
.. autofunction:: ReLU
.. autofunction:: Tanh
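A minimal usage sketch follows. It assumes the constructors take the same positional
arguments as their torch.nn counterparts
(input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional),
which is how they are exercised in tests/RNN/RNN_tests.py::

    import torch
    from torch.autograd import Variable
    import apex

    # (input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional)
    rnn = apex.RNN.LSTM(10, 20, 2, True, False, 0.0, False).cuda()

    # Input is (seq_len, batch, input_size), matching torch.nn.RNN's default layout.
    inp = Variable(torch.cuda.FloatTensor(5, 4, 10).uniform_(), requires_grad=True)
    out, hiddens = rnn(inp)
    out.sum().backward()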
body {
font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}
/* Default header fonts are ugly */
h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}
/* Use white for docs background */
.wy-side-nav-search {
background-color: #fff;
}
.wy-nav-content-wrap, .wy-menu li.current > a {
background-color: #fff;
}
@media screen and (min-width: 1400px) {
.wy-nav-content-wrap {
background-color: rgba(0, 0, 0, 0.0470588);
}
.wy-nav-content {
background-color: #fff;
}
}
/* Fixes for mobile */
.wy-nav-top {
background-color: #fff;
background-image: url('../img/apex.jpg');
background-repeat: no-repeat;
background-position: center;
padding: 0;
margin: 0.4045em 0.809em;
color: #333;
}
.wy-nav-top > a {
display: none;
}
@media screen and (max-width: 768px) {
.wy-side-nav-search>a img.logo {
height: 60px;
}
}
/* This is needed to ensure that logo above search scales properly */
.wy-side-nav-search a {
display: block;
}
/* This ensures that multiple constructors will remain in separate lines. */
.rst-content dl:not(.docutils) dt {
display: table;
}
/* Use our red for literals (it's very similar to the original color) */
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
color: #F05732;
}
.rst-content tt.xref, a .rst-content tt, .rst-content tt.xref,
.rst-content code.xref, a .rst-content tt, a .rst-content code {
color: #404040;
}
/* Change link colors (except for the menu) */
a {
color: #F05732;
}
a:hover {
color: #F05732;
}
a:visited {
color: #D44D2C;
}
.wy-menu a {
color: #b3b3b3;
}
.wy-menu a:hover {
color: #b3b3b3;
}
/* Default footer text is quite big */
footer {
font-size: 80%;
}
footer .rst-footer-buttons {
font-size: 125%; /* revert footer settings - 1/80% = 125% */
}
footer p {
font-size: 100%;
}
/* For hidden headers that appear in TOC tree */
/* see http://stackoverflow.com/a/32363545/3343043 */
.rst-content .hidden-section {
display: none;
}
nav .hidden-section {
display: inherit;
}
.wy-side-nav-search>div.version {
color: #000;
}
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# PyTorch documentation build configuration file, created by
# sphinx-quickstart on Fri Dec 23 13:31:47 2016.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('.'))
# sys.path.insert(0, os.path.abspath('../../apex/parallel/'))
import apex
# import multiproc
import sphinx_rtd_theme
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.doctest',
'sphinx.ext.intersphinx',
'sphinx.ext.todo',
'sphinx.ext.coverage',
'sphinx.ext.mathjax',
'sphinx.ext.napoleon',
'sphinx.ext.viewcode',
]
napoleon_use_ivar = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'APEx'
copyright = '2018'
author = 'Christian Sarofeen, Natalia Gimelshein, Michael Carilli, Raul Puri'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
# TODO: change to [:2] at v1.0
# version = 'master (' + torch.__version__ + ' )'
version = '0.0'
# The full version, including alpha/beta/rc tags.
# TODO: verify this works as expected
release = '0.0.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = []
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {
'collapse_navigation': False,
'display_version': True,
'logo_only': True,
}
html_logo = '_static/img/nv-pytorch2.png'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# html_style_path = 'css/pytorch_theme.css'
html_context = {
'css_files': [
'https://fonts.googleapis.com/css?family=Lato',
'_static/css/pytorch_theme.css'
],
}
# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'PyTorchdoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'apex.tex', 'APEx Documentation',
'Torch Contributors', 'manual'),
]
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'APEx', 'APEx Documentation',
[author], 1)
]
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'APEx', 'APEx Documentation',
author, 'APEx', 'One line description of project.',
'Miscellaneous'),
]
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
'python': ('https://docs.python.org/', None),
'numpy': ('http://docs.scipy.org/doc/numpy/', None),
}
# -- A patch that prevents Sphinx from cross-referencing ivar tags -------
# See http://stackoverflow.com/a/41184353/3343043
from docutils import nodes
from sphinx.util.docfields import TypedField
from sphinx import addnodes
def patched_make_field(self, types, domain, items, **kw):
# `kw` catches `env=None` needed for newer sphinx while maintaining
# backwards compatibility when passed along further down!
# type: (List, unicode, Tuple) -> nodes.field
def handle_item(fieldarg, content):
par = nodes.paragraph()
par += addnodes.literal_strong('', fieldarg) # Patch: this line added
# par.extend(self.make_xrefs(self.rolename, domain, fieldarg,
# addnodes.literal_strong))
if fieldarg in types:
par += nodes.Text(' (')
# NOTE: using .pop() here to prevent a single type node to be
# inserted twice into the doctree, which leads to
# inconsistencies later when references are resolved
fieldtype = types.pop(fieldarg)
if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text):
typename = u''.join(n.astext() for n in fieldtype)
typename = typename.replace('int', 'python:int')
typename = typename.replace('long', 'python:long')
typename = typename.replace('float', 'python:float')
typename = typename.replace('type', 'python:type')
par.extend(self.make_xrefs(self.typerolename, domain, typename,
addnodes.literal_emphasis, **kw))
else:
par += fieldtype
par += nodes.Text(')')
par += nodes.Text(' -- ')
par += content
return par
fieldname = nodes.field_name('', self.label)
if len(items) == 1 and self.can_collapse:
fieldarg, content = items[0]
bodynode = handle_item(fieldarg, content)
else:
bodynode = self.list_type()
for fieldarg, content in items:
bodynode += nodes.list_item('', handle_item(fieldarg, content))
fieldbody = nodes.field_body('', bodynode)
return nodes.field('', fieldname, fieldbody)
TypedField.make_field = patched_make_field
.. role:: hidden
:class: hidden-section
apex.fp16_utils
===================================
This submodule contains utilities designed to streamline the mixed precision training recipe
presented by NVIDIA `on Parallel Forall`_ and in GTC 2018 Sessions
`Training Neural Networks with Mixed Precision: Theory and Practice`_ and
`Training Neural Networks with Mixed Precision: Real Examples`_.
For Pytorch users, Real Examples in particular is recommended.
.. _`on Parallel Forall`:
https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
.. _`Training Neural Networks with Mixed Precision: Theory and Practice`:
http://on-demand.gputechconf.com/gtc/2018/video/S8923/
.. _`Training Neural Networks with Mixed Precision: Real Examples`:
http://on-demand.gputechconf.com/gtc/2018/video/S81012/
.. automodule:: apex.fp16_utils
.. currentmodule:: apex.fp16_utils
.. FusedNorm
----------
.. autofunction:: prep_param_lists
.. autofunction:: master_params_to_model_params
.. autofunction:: model_grads_to_master_grads
.. autoclass:: FP16_Optimizer
:members:
.. autoclass:: Fused_Weight_Norm
:members:
.. .. automodule:: apex.fp16_utils.loss_scaler
.. autoclass:: LossScaler
:members:
.. autoclass:: DynamicLossScaler
:members:
.. .. automodule:: apex.fp16_utils.fp16util
:members:
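A minimal sketch of the master-weights recipe these utilities are meant to support.
The exact signatures used below (prep_param_lists returning a (model_params, master_params)
pair, and the two copy helpers taking that pair) are assumptions here; the autodoc entries
above are authoritative. ``model``, ``inputs``, ``targets``, and ``loss_fn`` are placeholders::

    import torch
    from apex.fp16_utils import (prep_param_lists,
                                 model_grads_to_master_grads,
                                 master_params_to_model_params)

    model = model.cuda().half()                              # fp16 model
    model_params, master_params = prep_param_lists(model)    # fp32 master copies (assumed return)
    optimizer = torch.optim.SGD(master_params, lr=0.01)
    loss_scale = 128.0

    loss = loss_fn(model(inputs), targets)
    (loss * loss_scale).backward()                 # scale the loss to protect small fp16 gradients
    model_grads_to_master_grads(model_params, master_params)
    for p in master_params:                        # unscale in fp32 before the update
        p.grad.data.mul_(1.0 / loss_scale)
    optimizer.step()
    master_params_to_model_params(model_params, master_params)  # copy updated weights back to fp16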
.. PyTorch documentation master file, created by
sphinx-quickstart on Fri Dec 23 13:31:47 2016.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
:github_url: https://gitlab-master.nvidia.com/csarofeen/apex
APEx (A PyTorch Extension)
===================================
This repo is designed to hold PyTorch modules and utilities that are under active development and experimental. It is not intended as a long-term or production solution; things placed here are intended to be eventually moved upstream into PyTorch.
A major focus of this extension is the training of neural networks using 16-bit floating point math, which offers significant performance benefits on the latest NVIDIA GPU architectures. The reduced dynamic range of half precision, however, makes training more vulnerable to numerical overflow/underflow.
APEx is an NVIDIA-maintained repository of utilities, including some that are targeted at improving the accuracy and stability of half precision networks while maintaining high performance. The utilities are designed to be minimally invasive and easy to use.
Installation requires CUDA 9, PyTorch 0.3 or later, and Python 3. Installation can be done by running
::
git clone https://www.github.com/nvidia/apex
cd apex
python setup.py install
.. toctree::
:maxdepth: 1
:caption: apex
parallel
reparameterization
RNN
fp16_utils
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
.. role:: hidden
:class: hidden-section
apex.parallel
===================================
.. automodule:: apex.parallel
.. currentmodule:: apex.parallel
Still need to figure out how to document multiproc.py.
.. DistributedDataParallel
----------
.. autoclass:: DistributedDataParallel
:members:
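A minimal usage sketch, following the pattern in the distributed MNIST example's main.py
from this repository (the ``args`` flags and the ``Net`` model are taken from that example)::

    import torch
    import torch.distributed as dist
    from apex.parallel import DistributedDataParallel as DDP

    # One process per GPU; set the device before anything else touches CUDA.
    torch.cuda.set_device(args.rank % torch.cuda.device_count())
    dist.init_process_group(args.dist_backend, init_method=args.dist_url,
                            world_size=args.world_size)

    model = Net().cuda()
    model = DDP(model)   # wrap AFTER the model has been moved to the GPU

One process per GPU can be launched with the bundled helper::

    python -m apex.parallel.multiproc main.py <your normal arguments>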
.. role:: hidden
:class: hidden-section
apex.reparameterization
===================================
.. automodule:: apex.reparameterization
.. currentmodule:: apex.reparameterization
.. autoclass:: Reparameterization
:members:
.. autoclass:: WeightNorm
:members:
# Basic Multiprocess Example based on the MNIST example
This version of this example requires APEx, which can be installed from https://www.github.com/nvidia/apex. The example demonstrates how to modify a network to use a basic but effective distributed data parallel module. This parallel method is designed to make it easy to run multi-GPU jobs on a single node. It was created because the parallel methods currently integrated into PyTorch can incur significant overhead due to the Python GIL; this method reduces the influence of those overheads and can provide a performance benefit, especially for networks with a significant number of fast-running operations.
## Getting started
Prior to running please run
```pip install -r requirements.txt```
and start a single-process run to allow the dataset to be downloaded (the download does not work properly multi-GPU; you can stop this job as soon as it starts iterating):
```python main.py```
You can now run the code multi-GPU with
```python -m apex.parallel.multiproc main.py ...```
adding any normal option you'd like.
## Converting your own model
To understand how to convert your own model to use the distributed module included, please see all sections of main.py within ```#=====START: ADDED FOR DISTRIBUTED======``` and ```#=====END: ADDED FOR DISTRIBUTED======``` flags.
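A condensed sketch of those sections is shown below; the variable names (`Net`, `train_dataset`, `kwargs`, and the argparse flags) come from main.py.

```python
#=====START: ADDED FOR DISTRIBUTED======
import torch.distributed as dist
import torch.utils.data.distributed
from apex.parallel import DistributedDataParallel as DDP

# One process per GPU: pick the device, then join the process group.
torch.cuda.set_device(args.rank % torch.cuda.device_count())
dist.init_process_group(args.dist_backend, init_method=args.dist_url,
                        world_size=args.world_size)

# Shard the dataset across processes and let the sampler handle shuffling.
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(
    train_dataset, sampler=train_sampler,
    batch_size=args.batch_size, shuffle=(train_sampler is None), **kwargs)

# Wrap the model AFTER it has been moved to the GPU.
model = Net().cuda()
model = DDP(model)
#=====END: ADDED FOR DISTRIBUTED======
```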
## Requirements
- PyTorch master branch built from source. This is required in order to use NCCL as the distributed backend.
- APEx installed from https://www.github.com/nvidia/apex
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
#=====START: ADDED FOR DISTRIBUTED======
'''Add custom module for distributed'''
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.")
'''Import distributed data loader'''
import torch.utils.data
import torch.utils.data.distributed
'''Import torch.distributed'''
import torch.distributed as dist
#=====END: ADDED FOR DISTRIBUTED======
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
help='how many batches to wait before logging training status')
#======START: ADDED FOR DISTRIBUTED======
'''
Add some distributed options. For explanation of dist-url and dist-backend please see
http://pytorch.org/tutorials/intermediate/dist_tuto.html
--world-size and --rank are required parameters as they will be used by the multiproc.py launcher
but do not have to be set explicitly.
'''
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
parser.add_argument('--world-size', default=1, type=int,
help='Number of GPUs to use. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
parser.add_argument('--rank', default=0, type=int,
help='Used for multi-process training. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
#=====END: ADDED FOR DISTRIBUTED======
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
#======START: ADDED FOR DISTRIBUTED======
'''Add a convenience flag to see if we are running distributed'''
args.distributed = args.world_size > 1
'''Check that we are running with cuda, as distributed is only supported for cuda.'''
if args.distributed:
assert args.cuda, "Distributed mode requires running with CUDA."
if args.distributed:
'''
Set cuda device so everything is done on the right GPU.
THIS MUST BE DONE AS SOON AS POSSIBLE.
'''
torch.cuda.set_device(args.rank % torch.cuda.device_count())
'''Initialize distributed communication'''
dist.init_process_group(args.dist_backend, init_method=args.dist_url,
world_size=args.world_size)
#=====END: ADDED FOR DISTRIBUTED======
torch.manual_seed(args.seed)
if args.cuda:
torch.cuda.manual_seed(args.seed)
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
#=====START: ADDED FOR DISTRIBUTED======
'''
Change sampler to distributed if running distributed.
Shuffle data loader only if distributed.
'''
train_dataset = datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_dataset, sampler=train_sampler,
batch_size=args.batch_size, shuffle=(train_sampler is None), **kwargs
)
#=====END: ADDED FOR DISTRIBUTED======
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.conv2_drop = nn.Dropout2d()
self.fc1 = nn.Linear(320, 50)
self.fc2 = nn.Linear(50, 10)
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, training=self.training)
x = self.fc2(x)
return F.log_softmax(x)
model = Net()
if args.cuda:
model.cuda()
#=====START: ADDED FOR DISTRIBUTED======
'''
Wrap model in our version of DistributedDataParallel.
This must be done AFTER the model is converted to cuda.
'''
if args.distributed:
model = DDP(model)
#=====END: ADDED FOR DISTRIBUTED======
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
def train(epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data), Variable(target)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args.log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.data[0]))
def test():
model.eval()
test_loss = 0
correct = 0
for data, target in test_loader:
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data, volatile=True), Variable(target)
output = model(data)
test_loss += F.nll_loss(output, target, size_average=False).data[0] # sum up batch loss
pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
correct += pred.eq(target.data.view_as(pred)).cpu().sum()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
for epoch in range(1, args.epochs + 1):
train(epoch)
test()
python -m apex.parallel.multiproc main.py
#ifndef THC_TENSOR_INFO_INC
#define THC_TENSOR_INFO_INC
#include <cuda.h>
#include <cuda_fp16.h>
#include <assert.h>
// Maximum number of dimensions allowed for cutorch
#define MAX_CUTORCH_DIMS 10
// Warning string for tensor arguments that are too large or have too
// many dimensions
#define CUTORCH_STR(X) #X
#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
enum float_types { FLOAT = 0 , HALF = 1, DOUBLE = 2 };
// CUDA kernel argument that defines tensor layout
template <typename T, typename IndexType>
struct TensorInfo {
TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS]);
TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS],
float_types type);
//Good way to cast from another format
//template <TensorInfo<typename T2, typename I2> >
//TensorInfo(TensorInfo<T2, I2> &tinfo_in){
// data = reinterpret_cast<T*>(tinfo_in.data);
//}
T* data;
IndexType sizes[MAX_CUTORCH_DIMS];
IndexType strides[MAX_CUTORCH_DIMS];
int dims;
float_types type;
};
//Expand our combinations as convenient typedefs
typedef TensorInfo<half, int> t_hi;
typedef TensorInfo<half, long> t_hl;
typedef TensorInfo<float, int> t_fi;
typedef TensorInfo<float, long> t_fl;
template <typename T, typename IndexType>
TensorInfo<T, IndexType>::TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS]) {
data = p;
dims = dim;
assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
for (int i = 0; i < dim; ++i) {
sizes[i] = sz[i];
strides[i] = st[i];
}
}
template<typename T, typename IndexType>
TensorInfo<T, IndexType>::TensorInfo(T* p,
int dim,
IndexType sz[MAX_CUTORCH_DIMS],
IndexType st[MAX_CUTORCH_DIMS],
float_types _type){
data = p;
dims = dim;
assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
for (int i = 0; i < dim; ++i) {
sizes[i] = sz[i];
strides[i] = st[i];
}
type=_type;
}
// Translate a linear index for the apply to a T* offset;
// specialized on `Dims` to reduce nvcc compilation time
template <typename T, typename IndexType, int Dims>
struct IndexToOffset {
static __forceinline__ __host__ __device__ IndexType get(
IndexType linearId,
const TensorInfo<T, IndexType>& info) {
IndexType offset = 0;
// Use static dims
for (int i = Dims - 1; i > 0; --i) {
offset += linearId % info.sizes[i] * info.strides[i];
linearId /= info.sizes[i];
}
offset += linearId * info.strides[0];
return offset;
}
};
// For contiguous tensors, the offset = index
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -2> {
static __forceinline__ __host__ __device__ IndexType
get(IndexType linearId, const TensorInfo<T, IndexType>& info) {
return linearId;
}
};
template <typename T, typename IndexType>
struct IndexToOffset<T, IndexType, -1> {
static __forceinline__ __host__ __device__ IndexType get(
IndexType linearId,
const TensorInfo<T, IndexType>& info) {
IndexType offset = 0;
// Use dynamic dims
for (int i = info.dims - 1; i >= 0; --i) {
IndexType curDimIndex = linearId % info.sizes[i];
IndexType curDimOffset = curDimIndex * info.strides[i];
offset += curDimOffset;
linearId /= info.sizes[i];
}
return offset;
}
};
#endif // THC_TENSOR_INFO_INC
#include "THCTensorInfo.cuh"
#include <iostream>
#include <cstdio>
#include <cassert>
#include <cuda.h>
// this is suboptimal, try forward declarations later
#include <vector>
#define Dims -2
#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<T, IndexType, Dims>::get(INDEX, D_TENSOR)]
#define DEVICE_LINEAR_GET_F(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<float, IndexType, Dims>::get(INDEX, D_TENSOR)]
// template <typename T, typename IndexType>
// void send_to_kernel(
// TensorInfo<T, IndexType> Input_1,
// TensorInfo<T, IndexType> Input_2,
// IndexType totalElems
// );
typedef int idxType;
struct send_to_fwd_wrapper
{
template<typename DataType,
typename AccumType,
typename IndexType>
static void call(std::vector<TensorInfo<void, idxType>>& tensors, int dim);
};
struct send_to_bwd_wrapper
{
template<typename DataType,
typename AccumType,
typename IndexType>
static void call(std::vector<TensorInfo<void, idxType>>& tensors, int dim);
};
template <typename In, typename Out>
struct ScalarConvert {
static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
};
#ifdef CUDA_HALF_TENSOR
template <typename Out>
struct ScalarConvert<half, Out> {
static __host__ __device__ __forceinline__ Out to(const half v) {
#ifdef __CUDA_ARCH__
return (Out) __half2float(v);
#else
return (Out) THC_half2float(v);
#endif
}
};
template <typename In>
struct ScalarConvert<In, half> {
static __host__ __device__ __forceinline__ half to(const In v) {
#ifdef __CUDA_ARCH__
return __float2half((float) v);
#else
return THC_float2half((float) v);
#endif
}
};
template <>
struct ScalarConvert<half, half> {
static __host__ __device__ __forceinline__ half to(const half v) {
return v;
}
};
#endif
import re
import os
import platform
import shutil
import inspect
import distutils
import distutils.spawn
from distutils.command.clean import clean
from setuptools import setup, Extension, find_packages
from setuptools.command.install import install
import subprocess
import ctypes.util
import torch
#Walk a path, applying regex_func to each file name to decide whether to keep it.
#If collect is True, return a list of all occurrences; otherwise return the first match.
def find(path, regex_func, collect=False):
collection = [] if collect else None
for root, dirs, files in os.walk(path):
for file in files:
if regex_func(file):
if collect:
collection.append(os.path.join(root, file))
else:
return os.path.join(root, file)
return list(set(collection)) if collect else None
def findcuda():
CUDA_HOME = os.getenv('CUDA_HOME', '/usr/local/cuda')
if not os.path.exists(CUDA_HOME):
# We use the nvcc path on Linux and the cudart path on macOS.
# No find_nvcc helper is defined in this repo, so locate nvcc on the PATH instead.
osname = platform.system()
if osname == 'Linux':
nvcc_path = distutils.spawn.find_executable('nvcc')
cuda_path = os.path.dirname(nvcc_path) if nvcc_path else None
else:
cudart_path = ctypes.util.find_library('cudart')
if cudart_path is not None:
cuda_path = os.path.dirname(cudart_path)
else:
cuda_path = None
if cuda_path is not None:
CUDA_HOME = os.path.dirname(cuda_path)
else:
CUDA_HOME = None
WITH_CUDA = CUDA_HOME is not None
return CUDA_HOME
#Get some important paths
curdir = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
buildir = curdir+os.sep+"build"
if not os.path.exists(buildir):
os.makedirs(buildir)
torch_dir = os.path.split(torch.__file__)[0] + os.sep + "lib"
cuda_files = find(curdir, lambda file: file.endswith(".cu"), True)
cuda_headers = find(curdir, lambda file: file.endswith(".cuh"), True)
headers = find(curdir, lambda file: file.endswith(".h"), True)
libaten = find(torch_dir, re.compile("libaten", re.IGNORECASE).search, False)
aten_h = find(torch_dir, re.compile("aten.h", re.IGNORECASE).search, False)
include_dirs = [os.path.dirname(os.path.dirname(aten_h))]
library_dirs = []
for file in cuda_headers+headers:
dir = os.path.dirname(file)
if dir not in include_dirs:
include_dirs.append(dir)
assert libaten, "Could not find PyTorch's libATen."
assert aten_h, "Could not find PyTorch's ATen header."
library_dirs.append(os.path.dirname(libaten))
#create some places to collect important things
object_files = []
extra_link_args=[]
main_libraries = []
main_libraries += ['cudart', 'cuda', 'ATen']
extra_compile_args = ["--std=c++11",]
#findcuda returns root dir of CUDA
#include cuda/include and cuda/lib64 for python module build.
CUDA_HOME=findcuda()
library_dirs.append(os.path.join(CUDA_HOME, "lib64"))
include_dirs.append(os.path.join(CUDA_HOME, 'include'))
class RMBuild(clean):
def run(self):
#BE VERY CAUTIOUS WHEN USING RMTREE!!!
#These are some carefully written/crafted directories
if os.path.exists(buildir):
shutil.rmtree(buildir)
distdir = curdir+os.sep+"dist"
if os.path.exists(distdir):
shutil.rmtree(distdir)
eggdir = curdir+os.sep+"apex.egg-info"
if os.path.exists(eggdir):
shutil.rmtree(eggdir)
clean.run(self)
def CompileCudaFiles():
print()
print("Compiling cuda modules with nvcc:")
#Architectures to compile for. Compiling for compute capability 7.0 requires CUDA 9.
nvcc_cmd = ['nvcc',
'-Xcompiler',
'-fPIC',
'-gencode', 'arch=compute_52,code=sm_52',
'-gencode', 'arch=compute_60,code=sm_60',
'-gencode', 'arch=compute_61,code=sm_61',
'-gencode', 'arch=compute_70,code=sm_70',
'-gencode', 'arch=compute_70,code=compute_70',
'--std=c++11',
'-O3',
]
for dir in include_dirs:
nvcc_cmd.append("-I"+dir)
for file in cuda_files:
object_name = os.path.basename(
os.path.splitext(file)[0]+".o"
)
object_file = os.path.join(buildir, object_name)
object_files.append(object_file)
file_opts = ['-c', file, '-o', object_file]
print(' '.join(nvcc_cmd+file_opts))
subprocess.check_call(nvcc_cmd+file_opts)
for object_file in object_files:
extra_link_args.append(object_file)
print()
print("Arguments used to build CUDA extension:")
print("extra_compile_args :", extra_compile_args)
print("include_dirs: ", include_dirs)
print("extra_link_args: ", extra_link_args)
print("library_dirs: ", library_dirs)
print("libraries: ", main_libraries)
print()
CompileCudaFiles()
print("Building CUDA extension.")
cuda_ext = Extension('apex._C',
[os.path.join('csrc', 'Module.cpp')],
extra_compile_args = extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args,
library_dirs=library_dirs,
runtime_library_dirs = library_dirs,
libraries=main_libraries
)
print("Building module.")
setup(
name='apex', version='0.1',
cmdclass={
'clean' : RMBuild,
},
ext_modules=[cuda_ext,],
description='PyTorch Extensions written by NVIDIA',
packages=find_packages(exclude=("build", "csrc", "include", "tests")),
)
import torch
import torch.nn as nn
from torch.autograd import Variable
import apex
from apex.RNN.models import bidirectionalRNN, stackedRNN, RNNCell
from torch.nn._functions.rnn import LSTMCell
import itertools
torch.backends.cudnn.enabled=False
batch_first = False #not implemented yet
dropout = 0.0 #How to validate?
bidirectional = False #True works, but differs in definition to PyTorch
rnn_types = ['LSTM', 'GRU', 'ReLU', 'Tanh']
sizes = [8,4,2]
seq_sizes = sizes
hidden_sizes = sizes
inp_sizes = sizes
batch_sizes = sizes
num_layerss = sizes
biases = [True]
def copy_param_set(pyt_rnn, my_rnn, layer=0, reverse=False):
my_params = None
rnn = None
if isinstance(my_rnn, bidirectionalRNN):
rnn = my_rnn.fwd.rnns[layer] if not reverse else my_rnn.bckwrd.rnns[layer]
elif isinstance(my_rnn, stackedRNN):
rnn = my_rnn.rnns[layer]
else:
raise RuntimeError()
param_names = ['w_ih', 'w_hh', 'b_ih', 'b_hh']
if not hasattr(rnn, 'b_hh'):
param_names = param_names[:2]
my_params = [getattr(rnn, param_name) for param_name in param_names]
pyt_params = None
param_names = ['weight_ih_', 'weight_hh_', 'bias_ih_', 'bias_hh_']
reverse_str = '_reverse' if reverse else ''
if not hasattr(pyt_rnn, 'bias_hh_l0'):
param_names=param_names[:2]
pyt_params =[getattr(pyt_rnn, param_name + 'l' + str(layer) + reverse_str )
for param_name in param_names ]
for pyt_param, my_param in zip(pyt_params, my_params):
pyt_param.data.copy_(my_param.data)
def copy_all_params(pyt_rnn, my_rnn):
for layer in range(num_layers):
copy_param_set(pyt_rnn, my_rnn, layer)
if bidirectional:
copy_param_set(pyt_rnn, my_rnn, layer, bidirectional)
def compare_variables(v1, v2, msg, params):
diff = float((v1.data-v2.data).abs().max())
if diff > 1e-5:
print("Error of ", diff, " found for ", msg, " for case: ", str(params))
def compare_tuple_variables(t1, t2, msg, params):
for var1, var2 in zip(t1, t2):
compare_variables(var1, var2, msg, params)
def maybe_compare(v1, v2, msg, params):
if isinstance(v1, Variable) and isinstance(v2, Variable):
compare_variables(v1, v2, msg, params)
else:
compare_tuple_variables(v1, v2, msg, params)
product = list(itertools.product(rnn_types, seq_sizes, hidden_sizes, inp_sizes, batch_sizes, num_layerss, biases))
for test_case in product:
rnn_type, seq_size, hidden_size, inp_size, batch_size, num_layers, bias = test_case
inp = torch.cuda.FloatTensor(seq_size, batch_size, inp_size).uniform_()
if rnn_type == 'ReLU' or rnn_type == 'Tanh':
pytorch_rnn = nn.RNN(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional, nonlinearity=rnn_type.lower()).cuda()
else:
pytorch_rnn = getattr(nn, rnn_type)(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional).cuda()
my_rnn = getattr(apex.RNN.models, rnn_type)(inp_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional).cuda()
copy_all_params(pytorch_rnn, my_rnn)
pyt_inp = Variable(inp, requires_grad=True)
my_inp = Variable(inp, requires_grad=True)
my_out, my_hiddens = my_rnn(my_inp)
pyt_out, pyt_hiddens = pytorch_rnn(pyt_inp)
pyt_out.sum().backward()
my_out.sum().backward()
maybe_compare(pyt_out, my_out, "out", test_case)
#If there's only one hidden state PyTorch doesn't return it in a tuple,
#apex does, so we wrap PyTorch's returned hidden state in a tuple.
if not isinstance(pyt_hiddens, tuple):
pyt_hiddens = (pyt_hiddens,)
try:
for i, (pyt_hid, my_hid) in enumerate(zip(pyt_hiddens, my_hiddens)):
maybe_compare(pyt_hid, my_hid , "hx_"+str(i), test_case)
except ValueError:
maybe_compare(pyt_hiddens, my_hiddens , "hx_0", test_case)
maybe_compare(pyt_inp.grad, my_inp.grad, "inp.grad", test_case)
print("Test passed.")
import torch
from torch.autograd import Variable
# import apex
import numpy as np
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
rows = 3
cols = 20
dims = rows, cols
# Incoming gradient vectors we will use later
# Need to create the fp16 versions as a half() copy of a Tensor first rather than
# a Variable, because if you create pt_input_control as a Variable then say
# pt_input_fp16 = pt_input_control.half(), you are accidentally making pt_input_fp16 part of
# pt_input_control's computational graph, so it will not be a leaf!
pt_input_control = Variable(torch.randn(*dims).cuda(), requires_grad=True)
# pt_input_control = torch.ones(*dims).cuda()
pt_input_fp16 = pt_input_control.half()
pt_output_fp16 = pt_input_fp16.sum()
pt_output_control = pt_input_control.sum()
print("After sum()s, before backwards:")
print("pt_output_control.requires_grad = ", pt_output_control.requires_grad)
print("pt_output_control.volatile = ", pt_output_control.volatile)
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
pt_output_fp16.backward() # pt_input_fp16 is not the leaf of this graph, pt_input_control is.
print("After pt_output_fp16.backward():")
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
pt_output_control.backward() # Both backward() calls have pt_input_control as leaves, and so
# will accumulate gradients into pt_input_control.grad
print("After pt_output_control.backward():")
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
print("pt_output_control = ", pt_output_control)
print("pt_output_fp16 = ", pt_output_fp16)