"git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "2a2289fb6b7e01c862a89c2c97dd856f1407c2b5"
Commit 843cdbe0 authored by Michael Carilli

Merging in master

parents 724672d7 28097c99
@@ -10,6 +10,8 @@
#include "ATen/AccumulateType.h"
#include <THC/THCGeneral.h>
+#include "type_shim.h"
typedef enum{
ADAM_MODE_0 =0, // eps under square root
ADAM_MODE_1 =1 // eps outside square root
@@ -29,8 +31,8 @@ __global__ void adam_cuda_kernel(
const float step_size,
const size_t tsize,
adamMode_t mode,
-const float decay) {
+const float decay)
+{
//Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
@@ -67,7 +69,9 @@ void fused_adam_cuda(
int step,
int mode,
int bias_correction,
-float decay) {
+float decay)
+{
+// using namespace at;
//Get tensor size
int tsize = p.numel();
@@ -91,6 +95,7 @@ void fused_adam_cuda(
//all other values should be fp32 for half gradients
AT_ASSERTM(p.type().scalarType() == at::ScalarType::Float, "expected parameter to be of float type");
//dispatch is done on the gradient type
+using namespace at; // prevents "toString is undefined" errors
AT_DISPATCH_FLOATING_TYPES_AND_HALF(g.type(), "adam_cuda_kernel", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
adam_cuda_kernel<accscalar_t, scalar_t><<<blocks,threadsPerBlock, 0, stream>>>(
@@ -109,6 +114,7 @@ void fused_adam_cuda(
decay);
}));
} else {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES(g.type(), "adam_cuda_kernel", ([&] {
adam_cuda_kernel<scalar_t, scalar_t><<<blocks,threadsPerBlock, 0, stream>>>(
p.data<scalar_t>(),
......
@@ -5,7 +5,11 @@
namespace {
void compute_n1_n2(
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
int& n1,
int& n2)
{
@@ -22,7 +26,11 @@ void compute_n1_n2(
}
void check_args(
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor gamma,
at::Tensor beta
)
@@ -33,7 +41,11 @@ void check_args(
void check_args(
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
int& n1,
int& n2
)
@@ -69,7 +81,11 @@ void check_args(
void check_args(
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor gamma,
at::Tensor beta,
int& n1,
@@ -88,7 +104,11 @@ void cuda_layer_norm(
at::Tensor* input,
int n1,
int n2,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor* gamma,
at::Tensor* beta,
double epsilon);
@@ -99,7 +119,11 @@ void cuda_layer_norm(
std::vector<at::Tensor> layer_norm(
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
double epsilon) {
CHECK_INPUT(input);
int n1,n2;
@@ -113,7 +137,11 @@ std::vector<at::Tensor> layer_norm(
}
std::vector<at::Tensor> layer_norm_affine(
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor gamma,
at::Tensor beta,
double epsilon) {
@@ -137,7 +165,11 @@ void cuda_layer_norm_gradient(
at::Tensor* input,
int n1,
int n2,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor* gamma,
at::Tensor* beta,
double epsilon,
@@ -151,7 +183,11 @@ at::Tensor layer_norm_gradient(
at::Tensor mean,
at::Tensor invvar,
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
double epsilon) {
CHECK_INPUT(dout);
CHECK_INPUT(mean);
@@ -170,7 +206,11 @@ std::vector<at::Tensor> layer_norm_gradient_affine(
at::Tensor mean,
at::Tensor invvar,
at::Tensor input,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor gamma,
at::Tensor beta,
double epsilon) {
......
@@ -6,6 +6,8 @@
#include <cuda.h>
#include <cuda_runtime.h>
+#include "type_shim.h"
template<typename U> __device__
void cuWelfordOnlineSum(
const U curr,
@@ -238,17 +240,20 @@ template<> double rsqrt(double v) {
namespace {
// This is the un-specialized struct. Note that we prevent instantiation of this
// struct by putting an undefined symbol in the function body so it won't compile.
+// template <typename T>
+// struct SharedMemory
+// {
+// // Ensure that we won't compile any un-specialized types
+// __device__ T *getPointer()
+// {
+// extern __device__ void error(void);
+// error();
+// return NULL;
+// }
+// };
+// https://github.com/NVIDIA/apex/issues/246
template <typename T>
-struct SharedMemory
-{
-// Ensure that we won't compile any un-specialized types
-__device__ T *getPointer()
-{
-extern __device__ void error(void);
-error();
-return NULL;
-}
-};
+struct SharedMemory;
template <>
struct SharedMemory <float>
@@ -670,11 +675,16 @@ void cuda_layer_norm(
at::Tensor* input,
int n1,
int n2,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor* gamma,
at::Tensor* beta,
double epsilon)
{
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input->type(), "layer_norm_cuda_kernel", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
HostApplyLayerNorm(
@@ -764,7 +774,11 @@ void cuda_layer_norm_gradient(
at::Tensor* input,
int n1,
int n2,
+#ifdef VERSION_GE_1_1
+at::IntArrayRef normalized_shape,
+#else
at::IntList normalized_shape,
+#endif
at::Tensor* gamma,
at::Tensor* beta,
double epsilon,
@@ -772,6 +786,7 @@ void cuda_layer_norm_gradient(
at::Tensor* grad_gamma,
at::Tensor* grad_beta)
{
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input->type(), "cuComputeGradInput", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
HostLayerNormGradient(
......
-#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <assert.h>
-#include <cuda_runtime.h>
// #include <iostream>
@@ -16,7 +14,7 @@
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
-template<int n> struct TensorList
+template<int n> struct TensorListMetadata
{
void* addresses[n][depth_to_max_tensors[n-1]];
int sizes[depth_to_max_tensors[n-1]];
@@ -64,7 +62,7 @@ void multi_tensor_apply(
int ntensors = tensor_lists[0].size();
-TensorList<depth> tl;
+TensorListMetadata<depth> tl;
auto stream = at::cuda::getCurrentCUDAStream();
......
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
template<typename x_t, typename y_t, typename out_t>
struct AxpbyFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<3>& tl,
float a,
float b,
int arg_to_check)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
y_t* y = (y_t*)tl.addresses[1][tensor_loc];
y += chunk_idx*chunk_size;
out_t* out = (out_t*)tl.addresses[2][tensor_loc];
out += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
// Non-divergent exit condition for __syncthreads, not necessary here
float xs[ILP];
float ys[ILP];
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
xs[ii] = 0;
ys[ii] = 0;
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
xs[ii] = static_cast<float>(x[i]);
ys[ii] = static_cast<float>(y[i]);
}
}
// see note in multi_tensor_scale_kernel.cu
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
out[i] = static_cast<out_t>(a*xs[ii] + b*ys[ii]);
bool finite = true;
if(arg_to_check == -1)
finite = (isfinite(xs[ii]) && isfinite(ys[ii]));
if(arg_to_check == 0)
finite = isfinite(xs[ii]);
if(arg_to_check == 1)
finite = isfinite(ys[ii]);
if(!finite)
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
}
}
}
}
};
void multi_tensor_axpby_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
float a,
float b,
int arg_to_check)
{
using namespace at;
// The output (downscaled) type is always float.
// If build times suffer, think about where to put this dispatch,
// and what logic should be moved out of multi_tensor_apply.
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_axpby_cuda",
DISPATCH_FLOAT_AND_HALF(tensor_lists[1][0].scalar_type(), 1, "multi_tensor_axpby_cuda",
DISPATCH_FLOAT_AND_HALF(tensor_lists[2][0].scalar_type(), 2, "multi_tensor_axpby_cuda",
multi_tensor_apply<3>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AxpbyFunctor<scalar_t_0, scalar_t_1, scalar_t_2>(),
a,
b,
arg_to_check); )))
AT_CUDA_CHECK(cudaGetLastError());
// AT_CUDA_CHECK(cudaDeviceSynchronize());
}
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
template<typename x_t>
struct L2NormFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<1>& tl,
float* output)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
__shared__ float vals[512];
// Non-divergent exit condition for __syncthreads, not necessary here
float val = 0;
for(int i = threadIdx.x; i < n && i < chunk_size; i += blockDim.x)
{
float next = static_cast<float>(x[i]);
val += next*next;
}
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
{
if(!isfinite(final))
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
output[blockIdx.x] += final;
}
}
};
__global__ void cleanup(float* x, float* ret)
{
__shared__ float vals[512];
float val = 0;
if(threadIdx.x < 320)
val = x[threadIdx.x];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
*ret = sqrt(final);
}
at::Tensor multi_tensor_l2norm_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists)
{
auto output = at::zeros({320}, tensor_lists[0][0].options().dtype(at::ScalarType::Float));
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_cuda",
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
L2NormFunctor<scalar_t_0>(),
output.data<float>());)
AT_CUDA_CHECK(cudaGetLastError());
// AT_CUDA_CHECK(cudaDeviceSynchronize());
// This involves one more small kernel launch, but it will be negligible end to end.
// I could get rid of these by hacking the functor + multi tensor harness with persistence
// logic, but keeping it simple for now
auto ret = at::empty({1}, output.options());
auto stream = at::cuda::getCurrentCUDAStream();
cleanup<<<1, 512, 0, stream>>>(output.data<float>(), ret.data<float>());
return ret;
}
@@ -2,10 +2,15 @@
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
-#include "multi_tensor_apply.cuh"
+// Another possibility:
+// #include <torch/all.h>
#include <assert.h>
-#include <cuda_runtime.h>
+// Stringstream is a big hammer, but I want to rely on operator<< for dtype.
+#include <sstream>
+#include "type_shim.h"
+#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
@@ -16,16 +21,12 @@ struct ScaleFunctor
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
-TensorList<2>& tl,
+TensorListMetadata<2>& tl,
float scale)
{
-__shared__ int noop_smem;
-if(threadIdx.x == 0)
-noop_smem = *noop_gmem;
-__syncthreads();
-if(noop_smem == 1)
-return;
+// I'd like this kernel to propagate infs/nans.
+// if(*noop_gmem == 1)
+// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
@@ -39,7 +40,7 @@ struct ScaleFunctor
n -= chunk_idx*chunk_size;
-// Non-divergent exit condition for the __syncthreads
+// Non-divergent exit condition for __syncthreads, not necessary here
float incoming_vals[ILP];
for(int i_start = 0;
i_start < n && i_start < chunk_size;
@@ -64,20 +65,12 @@ struct ScaleFunctor
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
-if(isfinite(incoming_vals[ii]))
+{
out[i] = static_cast<out_t>(incoming_vals[ii]*scale);
-else
+if(!isfinite(incoming_vals[ii]))
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
+}
}
-// *noop_gmem = 1 is NOT guaranteed to be seen immediately by thread 0. I wonder if
-// we can rig block-wide and grid-wide short-circuiting with only one syncthreads.
-// It's possible we can just lean on the cache (no smem or syncs) and still be fast.
-if(threadIdx.x == 0)
-noop_smem = *noop_gmem;
-__syncthreads();
-if(noop_smem == 1)
-break;
}
}
};
@@ -88,15 +81,17 @@ void multi_tensor_scale_cuda(
std::vector<std::vector<at::Tensor>> tensor_lists,
float scale)
{
+using namespace at;
// The output (downscaled) type is always float.
// If build times suffer, think about where to put this dispatch,
// and what logic should be moved out of multi_tensor_apply.
AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensor_lists[0][0].type(),
"multi_tensor_scale_cuda",
[&]
{
// using accscalar_t = acc_type<scalar_t, true>;
-switch(tensor_lists[1][0].type().scalarType())
+switch(tensor_lists[1][0].scalar_type())
{
case at::ScalarType::Half:
multi_tensor_apply<2>(
@@ -117,8 +112,10 @@ void multi_tensor_scale_cuda(
scale);
break;
default:
-AT_ERROR("multi_tensor_scale_cuda not implemented for output type = ",
-tensor_lists[1][0].type().toString());
+std::stringstream ss;
+ss << "multi_tensor_scale_cuda not implemented for output type = "
+<< tensor_lists[1][0].dtype();
+AT_ERROR(ss.str().c_str());
}
});
......
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <assert.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 256
#define NBLOCKS 160*4
#define ILP 4
// It makes sense to lock the output type to fp32 because the downscaled
// grads should be master grads (and in the case of Amp, the params and their
// gradients should always be fp32).
template<typename in_t>
__global__ void scale_reduce_overflow(in_t* in,
float* out,
int n,
float scale,
volatile int* overflow_global)
{
__shared__ int overflow;
float incoming_vals[4];
// Non-divergent exit condition for the __syncthreads
for(int chunk_start = blockIdx.x*blockDim.x*ILP;
chunk_start < n;
chunk_start += gridDim.x*blockDim.x*ILP)
{
if(threadIdx.x == 0)
overflow = *overflow_global;
__syncthreads();
if(overflow == 1)
break;
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
incoming_vals[ii] = 0;
int i = chunk_start + threadIdx.x + ii*blockDim.x;
if(i < n)
incoming_vals[ii] = static_cast<float>(in[i]);
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = chunk_start + threadIdx.x + ii*blockDim.x;
if(i < n)
if(isfinite(incoming_vals[ii]))
out[i] = incoming_vals[ii]*scale;
else
*overflow_global = 1; // Blindly fire off a write. These will race but that's ok.
} // This is NOT guaranteed to be seen immediately by thread 0 on the next iteration.
} // I wonder if there's a way we can rig the short-circuiting with only one syncthreads.
} // It's possible we can just lean on the cache (no smem or syncs) and still be fast.
void scale_check_overflow_cuda
(const at::Tensor& grads,
float scale,
const at::Tensor& overflow_buf,
const at::Tensor& downscaled_grads)
{
using namespace at;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int n = grads.numel();
// Lock the output (downscaled) type to float.
AT_DISPATCH_FLOATING_TYPES_AND_HALF(grads.type(),
"scale_check_overflow_cuda",
[&]
{
// using accscalar_t = acc_type<scalar_t, true>;
scale_reduce_overflow<<<NBLOCKS, BLOCK_SIZE, 0, stream>>>
(grads.data<scalar_t>(),
downscaled_grads.data<float>(),
n,
scale,
overflow_buf.data<int>());
});
AT_CUDA_CHECK(cudaGetLastError());
}
#include <ATen/ATen.h>
// Forward/backward compatibility hack around
// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
// pending more future-proof guidance from upstream.
struct TypeShim
{
const at::Type& payload;
TypeShim(const at::Type& type) : payload(type) {}
// Enable trivial conversion to a const at::Type& for pre-3aeb78
operator const at::Type&(){ return payload; };
// Enable dispatch switch statements to take *this directly for post-3aeb78
operator at::ScalarType(){ return payload.scalarType(); };
};
#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
switch(TYPE) \
{ \
case at::ScalarType::Float: \
{ \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: \
{ \
using scalar_t_##LEVEL = at::Half; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
template<typename T>
__device__ __forceinline__ T reduce_block_into_lanes
(T *x,
T val,
int lanes=1,
bool share_result=false) // lanes is intended to be <= 32.
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] = x[tid] + x[tid+i];
__syncthreads();
}
T final;
if(tid < 32)
{
if(blockSize >= 64)
final = x[tid] + x[tid+32];
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final = final + __shfl_down_sync(0xffffffff, final, i);
}
if(share_result)
{
if(tid < lanes)
x[tid] = final; // EpilogueOp
// Make sure the smem result is visible to all warps.
__syncthreads();
}
return final;
}
@@ -3,13 +3,13 @@
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
+#include "type_shim.h"
__device__ __forceinline__ int lastpow2(int n)
{
@@ -844,16 +844,19 @@ std::vector<at::Tensor> welford_mean_var_CUDA(const at::Tensor input) {
auto stream = at::cuda::getCurrentCUDAStream();
-AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "welford_mean_var_kernel", ([&] {
-using accscalar_t = at::acc_type<scalar_t, true>;
-welford_kernel<scalar_t, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
-input.data<scalar_t>(),
-out_mean.data<accscalar_t>(),
-out_var_biased.data<accscalar_t>(),
-batch_size,
-feature_size,
-space_size);
-}));
+{
+using namespace at;
+AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "welford_mean_var_kernel", ([&] {
+using accscalar_t = at::acc_type<scalar_t, true>;
+welford_kernel<scalar_t, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
+input.data<scalar_t>(),
+out_mean.data<accscalar_t>(),
+out_var_biased.data<accscalar_t>(),
+batch_size,
+feature_size,
+space_size);
+}));
+}
return {out_mean, out_var_biased};
}
@@ -881,6 +884,7 @@ at::Tensor batchnorm_forward_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value() &&
weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_forward_kernel<scalar_t, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
@@ -898,6 +902,7 @@ at::Tensor batchnorm_forward_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_forward_kernel<scalar_t, accscalar_t, scalar_t><<<grid, block, 0, stream>>>(
@@ -950,6 +955,7 @@ std::vector<at::Tensor> reduce_bn_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value() &&
weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward_reduce", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
reduce_bn_kernel<scalar_t, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
@@ -970,6 +976,7 @@ std::vector<at::Tensor> reduce_bn_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward_reduce", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
reduce_bn_kernel<scalar_t, accscalar_t, scalar_t><<<grid, block, 0, stream>>>(
@@ -1017,6 +1024,7 @@ at::Tensor batchnorm_backward_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value() &&
weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_backward_kernel<scalar_t, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
@@ -1036,6 +1044,7 @@ at::Tensor batchnorm_backward_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_backward_kernel<scalar_t, accscalar_t, scalar_t><<<grid, block, 0, stream>>>(
@@ -1072,18 +1081,21 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_nodes,
auto stream = at::cuda::getCurrentCUDAStream();
-AT_DISPATCH_FLOATING_TYPES_AND_HALF(mean_feature_nodes.type(), "welford_parallel_kernel", ([&] {
-welford_kernel_parallel<scalar_t><<<grid, block, 0, stream>>>(
-mean_feature_nodes.data<scalar_t>(),
-var_biased.data<scalar_t>(),
-out_mean.data<scalar_t>(),
-out_var.data<scalar_t>(),
-inv_std.data<scalar_t>(),
-world_size,
-feature_size,
-eps,
-numel);
-}));
+{
+using namespace at;
+AT_DISPATCH_FLOATING_TYPES_AND_HALF(mean_feature_nodes.type(), "welford_parallel_kernel", ([&] {
+welford_kernel_parallel<scalar_t><<<grid, block, 0, stream>>>(
+mean_feature_nodes.data<scalar_t>(),
+var_biased.data<scalar_t>(),
+out_mean.data<scalar_t>(),
+out_var.data<scalar_t>(),
+inv_std.data<scalar_t>(),
+world_size,
+feature_size,
+eps,
+numel);
+}));
+}
return {out_mean, out_var, inv_std};
}
@@ -1111,21 +1123,23 @@ std::vector<at::Tensor> welford_mean_var_c_last_CUDA(const at::Tensor input) {
auto stream = at::cuda::getCurrentCUDAStream();
-AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "welford_mean_var_c_last", ([&] {
-using accscalar_t = at::acc_type<scalar_t, true>;
-accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
-int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
-welford_kernel_c_last<scalar_t, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
-<<<grid, block, 0, stream>>>(
-input.data<scalar_t>(),
-out_mean.data<accscalar_t>(),
-out_var_biased.data<accscalar_t>(),
-staging_data_ptr,
-semaphores_ptr,
-reduction_size,
-stride);
-}));
+{
+using namespace at;
+AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "welford_mean_var_c_last", ([&] {
+using accscalar_t = at::acc_type<scalar_t, true>;
+accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
+int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
+welford_kernel_c_last<scalar_t, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
+<<<grid, block, 0, stream>>>(
+input.data<scalar_t>(),
+out_mean.data<accscalar_t>(),
+out_var_biased.data<accscalar_t>(),
+staging_data_ptr,
+semaphores_ptr,
+reduction_size,
+stride);
+}));
+}
return {out_mean, out_var_biased};
}
@@ -1149,6 +1163,7 @@ at::Tensor batchnorm_forward_c_last_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value() && weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_forward_c_last_kernel<scalar_t, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
@@ -1167,6 +1182,7 @@ at::Tensor batchnorm_forward_c_last_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_forward_c_last_kernel<scalar_t, accscalar_t, scalar_t, ELEMENTS_PER_ITER>
@@ -1222,6 +1238,7 @@ std::vector<at::Tensor> reduce_bn_c_last_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value()
&& weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward_reduce", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
@@ -1246,6 +1263,7 @@ std::vector<at::Tensor> reduce_bn_c_last_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_backward_reduce", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
@@ -1291,6 +1309,7 @@ at::Tensor batchnorm_backward_c_last_CUDA(
if (input.type().scalarType() == at::ScalarType::Half
&& weight.has_value() && weight.value().type().scalarType() == at::ScalarType::Float) {
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_backward_c_last_kernel<scalar_t, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
@@ -1311,6 +1330,7 @@ at::Tensor batchnorm_backward_c_last_CUDA(
AT_CHECK(input.type().scalarType() == weight.value().type().scalarType(),
"input.type().scalarType() is not supported with weight.type().scalarType()");
}
+using namespace at;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "batchnorm_forward", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
batchnorm_backward_c_last_kernel<scalar_t, accscalar_t, scalar_t, ELEMENTS_PER_ITER>
......
@@ -15,31 +15,43 @@ is under construction.
Gradient clipping
-----------------
-If Amp uses master params distinct from the model params,
-then the params ``step()``\ ed by the optimizer are the master params,
-and it is the master gradients (rather than the model gradients) that must be clipped.
-If Amp is not using master params distinct from the model params, then the optimizer
-directly steps the model params, and the model grads must be clipped.
-In both cases, correct practice is to clip the gradients of the params that are about to be stepped **by the optimizer** (which may be distinct from ``model.parameters()``).
-Also, if Amp uses loss scaling, gradients must be clipped after they have been unscaled.
-The following pattern accounts for all possibilities, and should be correct for
-any ``opt_level``::
+Amp calls the params owned directly by the optimizer's ``param_groups`` the "master params."
+These master params may be fully or partially distinct from ``model.parameters()``.
+For example, with `opt_level="O2"`_, ``amp.initialize`` casts most model params to FP16,
+creates an FP32 master param outside the model for each newly-FP16 model param,
+and updates the optimizer's ``param_groups`` to point to these FP32 params.
+The master params owned by the optimizer's ``param_groups`` may also fully coincide with the
+model params, which is typically true for ``opt_level``\s ``O0``, ``O1``, and ``O3``.
+In all cases, correct practice is to clip the gradients of the params that are guaranteed to be
+owned **by the optimizer's** ``param_groups``, instead of those retrieved via ``model.parameters()``.
+Also, if Amp uses loss scaling, gradients must be clipped after they have been unscaled
+(which occurs during exit from the ``amp.scale_loss`` context manager).
+The following pattern should be correct for any ``opt_level``::
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
# Gradients are unscaled during context manager exit.
-# Now it's safe to clip:
+# Now it's safe to clip. Replace
+# torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+# with
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
# or
torch.nn.utils.clip_grad_value_(amp.master_params(optimizer), max_)
Note the use of the utility function ``amp.master_params(optimizer)``,
which returns a generator-expression that iterates over the
-params that the optimizer steps (master params if enabled, otherwise model params).
+params in the optimizer's ``param_groups``.
+Also note that ``clip_grad_norm_(amp.master_params(optimizer), max_norm)`` is invoked
+*instead of*, not *in addition to*, ``clip_grad_norm_(model.parameters(), max_norm)``.
+.. _`opt_level="O2"`:
+https://nvidia.github.io/apex/amp.html#o2-fast-mixed-precision
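As a quick illustration of why the distinction matters, a minimal sketch (assuming an ``O2`` run; the dtype comments state what the explanation above implies, not output copied from a real run)::

    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    # Params owned by the optimizer's param_groups are the FP32 master copies:
    print(optimizer.param_groups[0]['params'][0].dtype)  # expect torch.float32
    # ...while most model params have been cast to FP16:
    print(next(model.parameters()).dtype)                # expect torch.float16
    # Hence clipping should target amp.master_params(optimizer), not model.parameters().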
Custom/user-defined autograd functions
--------------------------------------
@@ -56,8 +68,11 @@ Forcing particular layers/functions to a desired type
I'm still working on a generalizable exposure for this that won't require user-side code divergence
across different ``opt-level``\ s.
-Multiple models/optimizers
---------------------------
+Multiple models/optimizers/losses
+---------------------------------
+Initialization with multiple models/optimizers
+**********************************************
``amp.initialize``'s optimizer argument may be a single optimizer or a list of optimizers,
as long as the output you accept has the same type.
@@ -65,35 +80,88 @@ Similarly, the ``model`` argument may be a single model or a list of models, as
output matches. The following calls are all legal::
model, optim = amp.initialize(model, optim,...)
-model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
-[model1, model2], optim = amp.initialize([model1, model2], optim,...)
-[model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)
+model, [optim0, optim1] = amp.initialize(model, [optim0, optim1],...)
+[model0, model1], optim = amp.initialize([model0, model1], optim,...)
+[model0, model1], [optim0, optim1] = amp.initialize([model0, model1], [optim0, optim1],...)
-Whenever you invoke a backward pass, the optimizer you should pass to ``amp.scaled_loss`` is whatever
-optimizer owns the parameters for which this particular backward pass is creating gradients.
-Multiple backward passes per iteration
---------------------------------------
-If you want to accumulate gradients from multiple losses for the params owned by a given optimizer,
-you must invoke ``with amp.scale_loss(..., delay_unscale=True)`` for all backward passes except
-the last::
+Backward passes with multiple optimizers
+****************************************
+Whenever you invoke a backward pass, the ``amp.scale_loss`` context manager must receive
+**all the optimizers that own any params for which the current backward pass is creating gradients.**
+This is true even if each optimizer owns only some, but not all, of the params that are about to
+receive gradients.
+If, for a given backward pass, there's only one optimizer whose params are about to receive gradients,
+you may pass that optimizer directly to ``amp.scale_loss``. Otherwise, you must pass the
+list of optimizers whose params are about to receive gradients::
-# delay_unscale=True for the first two losses
-with amp.scale_loss(loss1, optimizer, delay_unscale=True) as scaled_loss:
+# loss0 accumulates gradients only into params owned by optim0:
+with amp.scale_loss(loss0, optim0) as scaled_loss:
scaled_loss.backward()
-with amp.scale_loss(loss2, optimizer, delay_unscale=True) as scaled_loss:
+# loss1 accumulates gradients only into params owned by optim1:
+with amp.scale_loss(loss1, optim1) as scaled_loss:
scaled_loss.backward()
-# Don't delay_unscale for the final loss
-with amp.scale_loss(loss3, optimizer) as scaled_loss:
+# loss2 accumulates gradients into some params owned by optim0
+# and some params owned by optim1
+with amp.scale_loss(loss2, [optim0, optim1]) as scaled_loss:
scaled_loss.backward()
-optimizer.step()
+Optionally have Amp use a different loss scaler per-loss
+********************************************************
+By default, Amp maintains a single global loss scaler that will be used for all backward passes
+(all invocations of ``with amp.scale_loss(...)``). No additional arguments to ``amp.initialize``
+or ``amp.scale_loss`` are required to use the global loss scaler. The code snippets above with
+multiple optimizers/backward passes use the single global loss scaler under the hood,
+and they should "just work."
+However, you can optionally tell Amp to maintain a loss scaler per-loss, which gives Amp increased
+numerical flexibility. This is accomplished by supplying the ``num_losses`` argument to
+``amp.initialize`` (which tells Amp how many backward passes you plan to invoke, and therefore
+how many loss scalers Amp should create), then supplying the ``loss_id`` argument to each of your
+backward passes (which tells Amp the loss scaler to use for this particular backward pass)::
+model, [optim0, optim1] = amp.initialize(model, [optim0, optim1], ..., num_losses=3)
+with amp.scale_loss(loss0, optim0, loss_id=0) as scaled_loss:
+scaled_loss.backward()
+with amp.scale_loss(loss1, optim1, loss_id=1) as scaled_loss:
+scaled_loss.backward()
+with amp.scale_loss(loss2, [optim0, optim1], loss_id=2) as scaled_loss:
+scaled_loss.backward()
+``num_losses`` and ``loss_id``\ s should be specified purely based on the set of
+losses/backward passes. The use of multiple optimizers, or association of single or
+multiple optimizers with each backward pass, is unrelated.
Gradient accumulation across iterations
---------------------------------------
-Pass ``delay_unscale=True`` to ``amp.scale_loss`` until you're ready to ``step()``::
+The following should "just work," and properly accommodate multiple models/optimizers/losses, as well as
+gradient clipping via the `instructions above`_::
+if iter%iters_to_accumulate == 0:
+# Every iters_to_accumulate iterations, unscale and step
+with amp.scale_loss(loss, optimizer) as scaled_loss:
+scaled_loss.backward()
+# Gradient clipping if desired:
+# torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
+optimizer.step()
+optimizer.zero_grad()
+else:
+# Otherwise, accumulate gradients, don't unscale or step.
+with amp.scale_loss(loss, optimizer) as scaled_loss:
+scaled_loss.backward()
+As a minor performance optimization, you can pass ``delay_unscale=True``
+to ``amp.scale_loss`` until you're ready to ``step()``. You should only attempt ``delay_unscale=True``
+if you're sure you know what you're doing, because the interaction with gradient clipping and
+multiple models/optimizers/losses can become tricky.::
if iter%iters_to_accumulate == 0:
# Every iters_to_accumulate iterations, unscale and step
@@ -102,6 +170,48 @@ Pass ``delay_unscale=True`` to ``amp.scale_loss`` until you're ready to ``step()
optimizer.step()
optimizer.zero_grad()
else:
-# Otherwise, just accumulate gradients, don't unscale or step.
+# Otherwise, accumulate gradients, don't unscale or step.
with amp.scale_loss(loss, optimizer, delay_unscale=True) as scaled_loss:
scaled_loss.backward()
+.. _`instructions above`:
+https://nvidia.github.io/apex/advanced.html#gradient-clipping
+Custom data batch types
+-----------------------
+The intention of Amp is that you never need to cast your input data manually, regardless of
+``opt_level``. Amp accomplishes this by patching any models' ``forward`` methods to cast
+incoming data appropriately for the ``opt_level``. But to cast incoming data,
+Amp needs to know how. The patched ``forward`` will recognize and cast floating-point Tensors
+(non-floating-point Tensors like IntTensors are not touched) and
+Python containers of floating-point Tensors. However, if you wrap your Tensors in a custom class,
+the casting logic doesn't know how to drill
+through the tough custom shell to access and cast the juicy Tensor meat within. You need to tell
+Amp how to cast your custom batch class, by assigning it a ``to`` method that accepts a ``torch.dtype``
+(e.g., ``torch.float16`` or ``torch.float32``) and returns an instance of the custom batch cast to
+``dtype``. The patched ``forward`` checks for the presence of your ``to`` method, and will
+invoke it with the correct type for the ``opt_level``.
+Example::
+class CustomData(object):
+def __init__(self):
+self.tensor = torch.cuda.FloatTensor([1,2,3])
+def to(self, dtype):
+self.tensor = self.tensor.to(dtype)
+return self
+.. warning::
+Amp also forwards numpy ndarrays without casting them. If you send input data as a raw, unwrapped
+ndarray, then later use it to create a Tensor within your ``model.forward``, this Tensor's type will
+not depend on the ``opt_level``, and may or may not be correct. Users are encouraged to pass
+castable data inputs (Tensors, collections of Tensors, or custom classes with a ``to`` method)
+wherever possible.
+.. note::
+Amp does not call ``.cuda()`` on any Tensors for you. Amp assumes that your original script
+is already set up to move Tensors from the host to the device as needed.
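To illustrate both notes, a minimal sketch (``data_iter`` is a hypothetical iterator yielding raw numpy ndarrays)::

    np_batch = next(data_iter)                  # hypothetical: a raw numpy ndarray from your loader
    batch = torch.from_numpy(np_batch).cuda()   # you move data to the device yourself...
    output = model(batch)                       # ...and the patched forward casts it per opt_level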
@@ -13,6 +13,13 @@ on the Github page.
GANs are a tricky case that many people have requested. A `comprehensive DCGAN example`_
is under construction.
+If you already implemented Amp based on the instructions below, but it isn't behaving as expected,
+please review `Advanced Amp Usage`_ to see if any topics match your use case. If that doesn't help,
+`file an issue`_.
+.. _`file an issue`:
+https://github.com/NVIDIA/apex/issues
``opt_level``\ s and Properties
-------------------------------
@@ -26,8 +33,8 @@ override the defaults established by the ``opt_level``.
Example::
-# Declare model and optimizer as usual
-model = torch.nn.Linear(D_in, D_out).cuda().half()
+# Declare model and optimizer as usual, with default (FP32) precision
+model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# Allow Amp to perform casts as required by the opt_level
@@ -55,6 +62,9 @@ In this way, there's no risk adhering to the Amp API, and a lot of potential per
.. _`comprehensive DCGAN example`:
https://github.com/NVIDIA/apex/tree/master/examples/dcgan
+.. _`Advanced Amp Usage`:
+https://nvidia.github.io/apex/advanced.html
Properties
**********
@@ -102,9 +112,8 @@ Your incoming model should be FP32 already, so this is likely a no-op.
|
|
-``O1``: Conservative Mixed Precision
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+``O1``: Mixed Precision (recommended for typical use)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Patch all Torch functions and Tensor methods to cast their inputs according to a whitelist-blacklist
model. Whitelist ops (for example, Tensor Core-friendly ops like GEMMs and convolutions) are performed
in FP16. Blacklist ops that benefit from FP32 precision (for example, softmax)
@@ -119,11 +128,14 @@ are performed in FP32. ``O1`` also uses dynamic loss scaling, unless overridden
|
|
-``O2``: Fast Mixed Precision
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+``O2``: "Almost FP16" Mixed Precision
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``O2`` casts the model weights to FP16,
patches the model's ``forward`` method to cast input
data to FP16, keeps batchnorms in FP32, maintains FP32 master weights,
+updates the optimizer's ``param_groups`` so that the ``optimizer.step()``
+acts directly on the FP32 weights (followed by FP32 master weight->FP16 model weight
+copies if necessary),
and implements dynamic loss scaling (unless overridden).
Unlike ``O1``, ``O2`` does not patch Torch functions or Tensor methods.
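For instance, a minimal sketch (the explicit property override on the second line is optional and shown only for illustration)::

    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    # or, overriding one of the properties established by O2:
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=128.0)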
@@ -170,7 +182,7 @@ Advanced use cases
The unified Amp API supports gradient accumulation across iterations,
multiple backward passes per iteration, multiple models/optimizers,
-and custom/user-defined autograd functions. Gradient clipping and GANs also
+custom/user-defined autograd functions, and custom data batch classes. Gradient clipping and GANs also
require special treatment, but this treatment does not need to change
for different ``opt_level``\ s. Further details can be found here:
......
@@ -68,7 +68,6 @@ parser.add_argument("--local_rank", default=0, type=int)
parser.add_argument('--sync_bn', action='store_true',
help='enabling apex sync BN.')
-parser.add_argument('--has-ext', action='store_true')
parser.add_argument('--opt-level', type=str)
parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
parser.add_argument('--loss-scale', type=str, default=None)
@@ -118,7 +117,7 @@ def main():
args.world_size = 1
if args.distributed:
-args.gpu = args.local_rank % torch.cuda.device_count()
+args.gpu = args.local_rank
torch.cuda.set_device(args.gpu)
torch.distributed.init_process_group(backend='nccl',
init_method='env://')
@@ -334,10 +333,8 @@ def train(train_loader, model, criterion, optimizer, epoch):
optimizer.step()
if args.prof: torch.cuda.nvtx.range_pop()
-input, target = prefetcher.next()
if i%args.print_freq == 0:
-# Every print_freq iterations, check the loss accuracy and speed.
+# Every print_freq iterations, check the loss, accuracy, and speed.
# For best performance, it doesn't make sense to print these metrics every
# iteration, since they incur an allreduce and some host<->device syncs.
@@ -374,6 +371,8 @@ def train(train_loader, model, criterion, optimizer, epoch):
batch_time=batch_time,
loss=losses, top1=top1, top5=top5))
+input, target = prefetcher.next()
def validate(val_loader, model, criterion):
batch_time = AverageMeter()
......
**distributed_data_parallel.py** and **run.sh** show an example using Amp with
[apex.parallel.DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) or
[torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#distributeddataparallel)
and the PyTorch multiprocess launcher script,
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
The use of `Amp` with DistributedDataParallel does not need to change from ordinary
single-process use. The only gotcha is that wrapping your model with `DistributedDataParallel` must
come after the call to `amp.initialize`. Test via
```bash
bash run.sh
```
**This is intended purely as an instructional example, not a performance showcase.**
import torch
import argparse
import os
from apex import amp
# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
# FOR DISTRIBUTED: Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

# FOR DISTRIBUTED: If we are running under torch.distributed.launch,
# the 'WORLD_SIZE' environment variable will also be set automatically.
args.distributed = False
if 'WORLD_SIZE' in os.environ:
    args.distributed = int(os.environ['WORLD_SIZE']) > 1

if args.distributed:
    # FOR DISTRIBUTED: Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)

    # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will provide
    # environment variables, and requires that you use init_method=`env://`.
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')

torch.backends.cudnn.benchmark = True

N, D_in, D_out = 64, 1024, 16

# Each process receives its own batch of "fake input data" and "fake target data."
# The "training loop" in each process just uses this fake batch over and over.
# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
# example of distributed data sampling for both training and validation.
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

if args.distributed:
    # FOR DISTRIBUTED: After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = DistributedDataParallel(model)
    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
    # model = torch.nn.parallel.DistributedDataParallel(model,
    #                                                   device_ids=[args.local_rank],
    #                                                   output_device=args.local_rank)

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

if args.local_rank == 0:
    print("final loss = ", loss)
#!/bin/bash
python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
import torch import torch
from setuptools import setup, find_packages from setuptools import setup, find_packages
import subprocess
import sys import sys
if not torch.cuda.is_available(): if not torch.cuda.is_available():
print("Warning: Torch did not find available GPUs on this system.\n", print("\nWarning: Torch did not find available GPUs on this system.\n",
"If your intention is to cross-compile, this is not an error.") "If your intention is to cross-compile, this is not an error.\n")
print("torch.__version__ = ", torch.__version__) print("torch.__version__ = ", torch.__version__)
TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MAJOR = int(torch.__version__.split('.')[0])
...@@ -21,7 +22,7 @@ ext_modules = [] ...@@ -21,7 +22,7 @@ ext_modules = []
if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv: if "--cpp_ext" in sys.argv or "--cuda_ext" in sys.argv:
if TORCH_MAJOR == 0: if TORCH_MAJOR == 0:
raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, " raise RuntimeError("--cpp_ext requires Pytorch 1.0 or later, "
"found torch.__version__ = {}".format(torch.__version)) "found torch.__version__ = {}".format(torch.__version__))
from torch.utils.cpp_extension import BuildExtension from torch.utils.cpp_extension import BuildExtension
cmdclass['build_ext'] = BuildExtension cmdclass['build_ext'] = BuildExtension
...@@ -32,6 +33,25 @@ if "--cpp_ext" in sys.argv: ...@@ -32,6 +33,25 @@ if "--cpp_ext" in sys.argv:
CppExtension('apex_C', CppExtension('apex_C',
['csrc/flatten_unflatten.cpp',])) ['csrc/flatten_unflatten.cpp',]))
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1][0]
    torch_binary_major = torch.version.cuda.split(".")[0]
    torch_binary_minor = torch.version.cuda.split(".")[1]

    print("\nCompiling cuda extensions with")
    print(raw_output + "from " + cuda_dir + "/bin\n")

    if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
        # TODO: make this a hard error?
        print("\nWarning: Cuda extensions are being compiled with a version of Cuda that does "
              "not match the version used to compile Pytorch binaries.\n")
        print("Pytorch binaries were compiled with Cuda {}\n".format(torch.version.cuda))
if "--cuda_ext" in sys.argv: if "--cuda_ext" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--cuda_ext") sys.argv.remove("--cuda_ext")
...@@ -39,6 +59,8 @@ if "--cuda_ext" in sys.argv: ...@@ -39,6 +59,8 @@ if "--cuda_ext" in sys.argv:
if torch.utils.cpp_extension.CUDA_HOME is None: if torch.utils.cpp_extension.CUDA_HOME is None:
raise RuntimeError("--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") raise RuntimeError("--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else: else:
check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)
# Set up macros for forward/backward compatibility hack around # Set up macros for forward/backward compatibility hack around
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
version_ge_1_1 = [] version_ge_1_1 = []
...@@ -48,19 +70,21 @@ if "--cuda_ext" in sys.argv: ...@@ -48,19 +70,21 @@ if "--cuda_ext" in sys.argv:
ext_modules.append( ext_modules.append(
CUDAExtension(name='amp_C', CUDAExtension(name='amp_C',
sources=['csrc/amp_C_frontend.cpp', sources=['csrc/amp_C_frontend.cpp',
'csrc/scale_check_overflow_kernel.cu', 'csrc/multi_tensor_sgd_kernel.cu',
'csrc/multi_tensor_scale_kernel.cu', 'csrc/multi_tensor_scale_kernel.cu',
'csrc/multi_tensor_sgd_kernel.cu'], 'csrc/multi_tensor_axpby_kernel.cu',
'csrc/multi_tensor_l2norm_kernel.cu'],
extra_compile_args={'cxx': ['-O3'], extra_compile_args={'cxx': ['-O3'],
'nvcc':['-lineinfo', 'nvcc':['-lineinfo',
'-O3', '-O3',
# '--resource-usage',
'--use_fast_math']})) '--use_fast_math']}))
ext_modules.append( ext_modules.append(
CUDAExtension(name='fused_adam_cuda', CUDAExtension(name='fused_adam_cuda',
sources=['apex/optimizers/csrc/fused_adam_cuda.cpp', sources=['csrc/fused_adam_cuda.cpp',
'apex/optimizers/csrc/fused_adam_cuda_kernel.cu'], 'csrc/fused_adam_cuda_kernel.cu'],
extra_compile_args={'cxx': ['-O3',], extra_compile_args={'cxx': ['-O3',],
'nvcc':['-O3', 'nvcc':['-O3',
'--use_fast_math']})) '--use_fast_math']}))
ext_modules.append( ext_modules.append(
CUDAExtension(name='syncbn', CUDAExtension(name='syncbn',
...@@ -68,11 +92,11 @@ if "--cuda_ext" in sys.argv: ...@@ -68,11 +92,11 @@ if "--cuda_ext" in sys.argv:
'csrc/welford.cu'])) 'csrc/welford.cu']))
ext_modules.append( ext_modules.append(
CUDAExtension(name='fused_layer_norm_cuda', CUDAExtension(name='fused_layer_norm_cuda',
sources=['apex/normalization/csrc/layer_norm_cuda.cpp', sources=['csrc/layer_norm_cuda.cpp',
'apex/normalization/csrc/layer_norm_cuda_kernel.cu'], 'csrc/layer_norm_cuda_kernel.cu'],
extra_compile_args={'cxx': ['-O3'] + version_ge_1_1, extra_compile_args={'cxx': ['-O3'] + version_ge_1_1,
'nvcc':['-maxrregcount=50', 'nvcc':['-maxrregcount=50',
'-O3', '-O3',
'--use_fast_math'] + version_ge_1_1})) '--use_fast_math'] + version_ge_1_1}))
print(ext_modules) print(ext_modules)
......
...@@ -4,6 +4,7 @@ import functools as ft ...@@ -4,6 +4,7 @@ import functools as ft
import itertools as it import itertools as it
from apex import amp from apex import amp
from apex.amp import _amp_state
import torch import torch
from torch import nn from torch import nn
import torch.nn.functional as F import torch.nn.functional as F
...@@ -60,24 +61,27 @@ class PromoteModule(torch.nn.Module): ...@@ -60,24 +61,27 @@ class PromoteModule(torch.nn.Module):
class TestCache(unittest.TestCase): class TestCache(unittest.TestCase):
def setUp(self): def setUp(self):
self.handle = amp.init(enabled=True)
self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32) self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32)
common_init(self) common_init(self)
def tearDown(self): def tearDown(self):
self.handle._deactivate() pass
def train_eval_train_test(self, module, t): def train_eval_train_test(self, module, t):
model = module(t).cuda() model = module(t).cuda()
dummy_optimizer = torch.optim.SGD(model.parameters(), lr=1.0) optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
_amp_state.allow_incoming_model_not_fp32 = True
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
_amp_state.allow_incoming_model_not_fp32 = False
def training_step(): def training_step():
for param in model.parameters(): for param in model.parameters():
param.grad = None param.grad = None
loss = model(self.x).sum() loss = model(self.x).sum()
self.handle._default_scaler._loss_scale = 4.0 _amp_state.loss_scalers[0]._loss_scale = 4.0
with self.handle.scale_loss(loss, dummy_optimizer) as scaled_loss: with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward() scaled_loss.backward()
self.assertEqual(len([p.grad for p in model.parameters() if p.grad is not None]), 1) self.assertEqual(len([p.grad for p in model.parameters() if p.grad is not None]), 1)
...@@ -105,6 +109,8 @@ class TestCache(unittest.TestCase): ...@@ -105,6 +109,8 @@ class TestCache(unittest.TestCase):
# Simulates resuming training after eval # Simulates resuming training after eval
training_step() training_step()
_amp_state.handle._deactivate()
# I could easily have these as a set of for loops in a single test, # I could easily have these as a set of for loops in a single test,
# instead of going for granularity. # instead of going for granularity.
......
import unittest
import functools as ft
import itertools as it

from apex import amp
import torch
from torch import nn
import torch.nn.functional as F

from utils import common_init, HALF, FLOAT,\
    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT

try:
    import amp_C
    from amp_C import multi_tensor_axpby
    from apex.multi_tensor_apply import MultiTensorApply
    disabled = False
except ImportError as err:
    print("amp_C fused kernels unavailable, disabling TestMultiTensorApply. ImportError was ", err)
    disabled = True


class TestMultiTensorAxpby(unittest.TestCase):

    def setUp(self):
        common_init(self)
        self.a = 2.0
        self.b = 8.0
        self.xval = 4.0
        self.yval = 16.0
        self.overflow_buf = torch.cuda.IntTensor(1).zero_()
        self.ref = torch.cuda.FloatTensor([136.0])

    def tearDown(self):
        pass

    # The tensor creation here is written for convenience, not speed.
    def axpby(self, sizea, sizeb, applier, repeat_tensors,
              x_type, y_type, out_type, inplace=False):
        self.overflow_buf.zero_()
        t1 = torch.cuda.FloatTensor(sizea).fill_(1.0)
        t2 = torch.cuda.FloatTensor(sizeb).fill_(1.0)

        y_list = []
        for i in range(repeat_tensors):
            y_list += [t1.clone().to(y_type)*self.yval, t2.clone().to(y_type)*self.yval]

        x_list = [x.clone().to(x_type)*(self.xval/self.yval) for x in y_list]

        if inplace:
            out_list = y_list
        else:
            out_list = [out.clone().to(out_type)*3.0 for out in y_list]

        applier(multi_tensor_axpby, self.overflow_buf, [x_list, y_list, out_list], self.a, self.b, -1)

        self.assertTrue(all([torch.allclose(out, self.ref.to(out_type)) for out in out_list]),
                        msg="{} {} {} {} {} {} {}".format(sizea, sizeb, repeat_tensors,
                                                          x_type, y_type, out_type, inplace))
        self.assertTrue(self.overflow_buf.item() == 0,
                        msg="{} {} {} {} {} {} {}".format(sizea, sizeb, repeat_tensors,
                                                          x_type, y_type, out_type, inplace))

    # def find_inf(self, sizea, sizeb, applier, repeat_tensors, in_type, out_type, t, ind, val, inplace=False):
    #     self.overflow_buf.zero_()
    #     a = torch.cuda.FloatTensor(sizea).fill_(self.scale)
    #     b = torch.cuda.FloatTensor(sizeb).fill_(self.scale)
    #
    #     out_list = []
    #     for i in range(repeat_tensors):
    #         out_list += [a.clone().to(out_type), b.clone().to(out_type)]
    #
    #     if inplace:
    #         in_list = out_list
    #     else:
    #         in_list = [out.clone().to(in_type) for out in out_list]
    #
    #     applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale)
    #
    #     self.overflow_buf.zero_()
    #     in_list[t][ind] = val
    #     applier(multi_tensor_scale, self.overflow_buf, [in_list, out_list], 1./self.scale)
    #     self.assertTrue(self.overflow_buf.item())

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fuzz(self):
        input_size_pairs = (
            (7777*77, 555*555),
            (777, 555),
            (555, 2048*32+1),
            (2048*32+1, 555),
            (555, 2048*32),
            (2048*32, 555),
            (33333, 555),
            (555, 33333))
        appliers = (
            MultiTensorApply(2048*32),
            MultiTensorApply(333),
            MultiTensorApply(33333))
        repeat_tensors = (
            1,
            55)

        for sizea, sizeb in input_size_pairs:
            for applier in appliers:
                for repeat in repeat_tensors:
                    for x_type in (torch.float32, torch.float16):
                        for y_type in (torch.float32, torch.float16):
                            for out_type in (torch.float32, torch.float16):
                                for inplace in (True, False):
                                    if inplace is True and (y_type is not out_type):
                                        continue
                                    else:
                                        self.axpby(sizea, sizeb, applier, repeat,
                                                   x_type, y_type, out_type, inplace=inplace)
                                        # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
                                        #               0, 0, float('nan'), inplace=inplace)
                                        # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
                                        #               2*repeat-1, sizeb-1, float('inf'), inplace=inplace)
                                        # self.find_inf(sizea, sizeb, applier, repeat, in_type, out_type,
                                        #               2*(repeat//2), sizea//2, float('inf'), inplace=inplace)


if __name__ == '__main__':
    unittest.main()