Commit a3e2776a authored by Michael Carilli

Cleaned comments in fp16_utils and csrc, keeping non-docstring comments that are informative.

parent a288d585
......@@ -204,7 +204,7 @@ class FP16_Optimizer(object):
# if name in ['state', 'param_groups']:
# return self.optimizer.__dict__[name],
# but this would bypass self.optimizer's custom getters and setters, if it chose to define any.
# I could also use properties, as for loss_scale, but I have no idea if properties bypass
# I could also use properties, as for loss_scale, but I don't know if properties bypass
# self.optimizer's custom getters and setters.
if name == 'state':
return self.optimizer.state
......@@ -225,7 +225,11 @@ class FP16_Optimizer(object):
"""
Zero fp32 and fp16 parameter grads.
"""
# In principle, only the .grad attributes of the model params need to be zeroed,
# because gradients are copied into the FP32 master params. However, we zero
# all gradients owned by the optimizer, just to be safe:
self.optimizer.zero_grad()
# Zero fp16 gradients owned by the model:
for fp16_group in self.fp16_groups:
for param in fp16_group:
if param.grad is not None:
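As an aside, the typical training-loop usage implied by these methods looks roughly like the following (a sketch; model, criterion, and loader are placeholder names, and static_loss_scale is the constructor argument mentioned in the LossScaler docstring further down):

# Sketch only: model, criterion, and loader are hypothetical placeholders.
optimizer = FP16_Optimizer(torch.optim.SGD(model.parameters(), lr=0.1),
                           static_loss_scale=128.0)
for input, target in loader:
    optimizer.zero_grad()                  # zeros fp16 model grads and fp32 master grads
    loss = criterion(model(input), target)
    optimizer.backward(loss)               # replaces loss.backward()
    optimizer.step()                       # steps the fp32 master params, then copies them back to fp16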
......@@ -245,16 +249,14 @@ class FP16_Optimizer(object):
def _update_scale(self, has_overflow=False):
self.loss_scaler.update_scale(has_overflow)
# TODO: Register a hook on each variable to do the overflow check, gradient copy + downscale,
# fp32 allreduce for distributed in a different stream. Debatable which ops should be
# treated that way, but it'll be fun to play with.
# To consider: Integrate distributed with this wrapper by registering a hook on each variable
# that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream.
def _model_grads_to_master_grads(self):
for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)
def _downscale_master(self):
if self.loss_scale != 1.0:
# print("downscaling fp32 gradients")
for group in self.optimizer.param_groups:
for param in group['params']:
param.grad.data.mul_(1./self.loss_scale)
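For reference, the static loss-scaling pattern that this downscale completes looks roughly like this (a minimal sketch with an illustrative scale value; master_params stands in for the optimizer's fp32 params):

scale = 128.0                            # illustrative static loss scale
loss = criterion(model(input), target)   # hypothetical fp16 forward pass
(loss.float() * scale).backward()        # scaling keeps small gradients above fp16 underflow
for p in master_params:                  # after the fp16 grads are copied to the fp32 masters...
    if p.grad is not None:
        p.grad.data.mul_(1.0 / scale)    # ...undo the scale before the optimizer step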
......@@ -334,7 +336,7 @@ class FP16_Optimizer(object):
self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
# At this point, the optimizer's references to the model's fp32 parameters are up to date.
# The optimizer's hyperparameters and internal buffers are also up to date.
# However, the fp32 master copies of the model's fp16 params stored by the optimizer are now
# However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
# out of date. There are two options.
# 1: Refresh the master params from the model's fp16 params.
# This requires less storage but incurs precision loss.
......@@ -342,7 +344,7 @@ class FP16_Optimizer(object):
# We choose option 2.
#
# Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
# of # their associated parameters, because it's possible those buffers might not exist yet in
# of their associated parameters, because it's possible those buffers might not exist yet in
# the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
# constructed in the same way as the one whose state_dict we are loading, the same master params
# are guaranteed to exist, so we can just copy_() from the saved master params.
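A minimal sketch of option 2; the 'fp32_from_fp16' key name is assumed here for illustration, and copying in place via copy_() keeps the optimizer's existing references valid:

for current_group, saved_group in zip(self.fp32_from_fp16_groups,
                                      state_dict['fp32_from_fp16']):
    for current, saved in zip(current_group, saved_group):
        current.data.copy_(saved.data)   # in-place overwrite, no fp16 round trip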
......@@ -423,12 +425,11 @@ class FP16_Optimizer(object):
self._master_params_to_model_params()
# Our API expects the user to give us ownership of the backward() call by
# replacing all calls to loss.backward() with optimizer.backward(loss).
# This requirement holds whether or not the call to backward() is made within
# a closure.
# This requirement holds whether or not the call to backward() is made within a closure.
# If the user is properly calling optimizer.backward(loss) within "closure,"
# calling closure() here will give the fp32 master params fresh gradients
# for the optimizer to play with,
# so all wrapped_closure needs to do is call closure() and return the loss.
# for the optimizer to play with, so all wrapped_closure needs to do is call
# closure() and return the loss.
temp_loss = closure()
return temp_loss
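For clarity, the closure-based usage this comment assumes looks roughly like the following (a sketch; model, criterion, input, and target are placeholders):

def closure():
    optimizer.zero_grad()
    output = model(input)
    loss = criterion(output, target)
    optimizer.backward(loss)   # the wrapper owns backward(); do not call loss.backward() directly
    return loss

optimizer.step(closure)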
......@@ -496,11 +497,10 @@ class FP16_Optimizer(object):
optimizer.backward(loss2, update_master_grads=False)
optimizer.update_master_grads()
"""
# To think about: try multiple backward passes using retain_grad=True to find
# To consider: try multiple backward passes using retain_graph=True to find
# a loss scale that works. After you find a loss scale that works, do a final dummy
# backward pass with retain_graph=False to tear down the graph.
# Doing this would avoid discarding the iteration, but probably wouldn't
# improve overall efficiency.
# backward pass with retain_graph=False to tear down the graph. Doing this would avoid
# discarding the iteration, but probably wouldn't improve overall efficiency.
self.loss_scaler.backward(loss.float())
if update_master_grads:
self.update_master_grads()
......
......@@ -45,7 +45,7 @@ def backwards_debug_hook(grad):
raise RuntimeError("master_params recieved a gradient in the backward pass!")
def prep_param_lists(model, flat_master=False):
r"""
"""
Creates a list of FP32 master parameters for a given model, as in
`Training Neural Networks with Mixed Precision: Real Examples`_.
......
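A minimal usage sketch for this function; the (model_params, master_params) return pair is assumed here for illustration, consistent with how these lists are consumed elsewhere in this diff:

# Hypothetical usage sketch (return signature assumed, not confirmed by this diff):
model = torch.nn.Linear(4096, 4096).cuda().half()
model_params, master_params = prep_param_lists(model)   # fp16 model params, fp32 master copies
# ...after backward() on the fp16 model:
model_grads_to_master_grads(model_params, master_params)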
......@@ -42,7 +42,7 @@ class Fused_Weight_Norm(Function):
output = input.new(input.size()).contiguous()
"""
For output with size (slow, faster, faster, ...fastest), we may want
For output with size (slow, faster, faster, ...fastest), we want
norms with size (slow, 1, 1, ...1), so that if you want to retrieve norms
and apply the same normalizing factors to another Tensor "t" with the
same size as output, "t/norms" will broadcast each element of norms
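A small standalone illustration of the broadcasting described above (shapes are arbitrary examples):

import torch
output = torch.randn(4, 8, 16)                          # (slow, faster, fastest)
norms = output.view(4, -1).norm(dim=1).view(4, 1, 1)    # one norm per slowest-dim slice
t = torch.randn(4, 8, 16)
normalized = t / norms                                   # norms broadcasts over the faster dims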
......@@ -88,7 +88,7 @@ class Fused_Weight_Norm(Function):
savedInput, savedg = ctx.saved_tensors
savedNorms = ctx.norms
# better safe than sorry
# We expect that these .contiguous() calls will be no-ops. They're present for safety.
grad_output_contig = grad_output.contiguous()
grad_input = grad_output_contig.new(grad_output.size()).contiguous()
......
"""
Top of loss_scaler.py stub. Can't figure out a way to get the module file
highlighted in a pretty way, or link back to source.
"""
import torch
# item() is a recent addition, so this helps with backward compatibility.
......@@ -18,6 +14,9 @@ class LossScaler:
Use of LossScaler is enabled via the ``static_loss_scale`` argument to
:class:`FP16_Optimizer`'s constructor.
Args:
scale (float, optional, default=1.0): The loss scale.
"""
def __init__(self, scale=1):
......@@ -31,7 +30,6 @@ class LossScaler:
def _has_inf_or_nan(x):
return False
# `overflow` is boolean indicating whether we overflowed in gradient
def update_scale(self, overflow):
pass
......@@ -92,10 +90,12 @@ class DynamicLossScaler:
# `x` is a torch.Tensor
def _has_inf_or_nan(x):
try:
# Stopgap until upstream fixes sum() on HalfTensors
# if x is half, the .float() incurs an additional deep copy, but it's necessary if
# Pytorch's .sum() creates a one-element tensor of the same type as x
# (which is true for some recent version of pytorch).
cpu_sum = float(x.float().sum())
# More efficient version that can be used if .sum() returns a Python scalar
# cpu_sum = float(x.sum())
# print(cpu_sum)
except RuntimeError as instance:
# We want to check if inst is actually an overflow exception.
# RuntimeError could come from a different error.
......@@ -108,7 +108,7 @@ class DynamicLossScaler:
return True
return False
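A self-contained version of the check sketched by these hunks (it skips the RuntimeError workaround above; NaN is detected via the fact that NaN != NaN):

def has_inf_or_nan(x):
    cpu_sum = float(x.float().sum())   # .float() avoids overflowing the fp16 reduction itself
    return (cpu_sum == float('inf') or cpu_sum == -float('inf')
            or cpu_sum != cpu_sum)     # NaN is the only value not equal to itself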
# `overflow` is boolean indicating whether we overflowed in gradient
# `overflow` is boolean indicating whether the gradient overflowed
def update_scale(self, overflow):
if overflow:
# self.cur_scale /= self.scale_factor
......
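For context, a sketch of the dynamic update rule consistent with the commented-out line above; scale_factor, scale_window, cur_iter, and last_overflow_iter are assumed attributes whose defaults are not shown in this diff:

def update_scale(self, overflow):
    if overflow:
        # back off (but not below 1) and remember when the overflow happened
        self.cur_scale = max(self.cur_scale / self.scale_factor, 1)
        self.last_overflow_iter = self.cur_iter
    elif (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
        # no overflow for a full window: try growing the scale again
        self.cur_scale *= self.scale_factor
    self.cur_iter += 1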
......@@ -53,7 +53,7 @@ TensorInfo<void, idxType> PyOb_2_tinfo(PyObject* tensor, float_types data_type)
void* data_ptr = (void*) PyLong_AsLong(PyDataPtr);
Py_ssize_t ndims = PyList_GET_SIZE(PySizes);
//TODO put proper checking on ndims < MAX_CUTORCH_DIMS
// TODO put proper checking on ndims < MAX_CUTORCH_DIMS
idxType strides[MAX_CUTORCH_DIMS], sizes[MAX_CUTORCH_DIMS];
for(int i = 0; i < ndims; i++)
......@@ -62,8 +62,6 @@ TensorInfo<void, idxType> PyOb_2_tinfo(PyObject* tensor, float_types data_type)
sizes[i] = PyLong_AsLong(PyTuple_GetItem(PySizes, i));
}
// Reference counts still behave strangely, but at least these appear to cap
// the process' memory usage.
Py_DECREF(PyStrides);
Py_DECREF(PySizes);
Py_DECREF(PyDataPtr);
......@@ -95,10 +93,6 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
cout << "For args item " << i << ", pyObjTypeCall = NULL" << endl;
}
// This gives a segfault:
// cout << "pyObjTypeCall direct conversion attempt = " <<
// PyBytes_AsString(pyObjTypeCall) << endl;
PyObject* pyObjASCII = PyUnicode_AsASCIIString(pyObjTypeCall);
if(pyObjASCII == NULL)
{
......@@ -106,12 +100,10 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
cout << "For args item " << i << ", pyObjASCII = NULL " << endl;
}
// cout << "Py_REFCNT(pyObjTypeCall) = " << Py_REFCNT(pyObjTypeCall) << endl;
Py_DECREF(pyObjTypeCall);
string objTypeCall(PyBytes_AsString(pyObjASCII));
// cout << "Py_REFCNT(pyObjASCII) = " << Py_REFCNT(pyObjASCII) << endl;
Py_DECREF(pyObjASCII);
#ifdef DEBUG_ANY
......@@ -131,7 +123,7 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
#endif
else if(objTypeCall == "torch.cuda.HalfTensor")
info_vec.push_back(PyOb_2_tinfo(pyTensor, HALF));
// Could add double
// TODO add double
else
{
ERROR_MSG;
......@@ -148,8 +140,6 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
}
}
// PyErr_SetString(PyExc_RuntimeError, "Exception set in ");
return info_vec;
}
......@@ -185,8 +175,8 @@ void dispatch
}
}
//Will extract all tensors in order. Assumes flat structure, tensors can not be wrapped in lists
//tuples or any other iterator structure.
// Will extract all tensors in order. Assumes a flat structure; tensors cannot be wrapped in lists,
// tuples, or any other iterable structure.
static PyObject* weight_norm_fwd(PyObject* self, PyObject* args)
{
#ifdef USE_NVTX
......
......@@ -22,7 +22,7 @@ template<> struct TtoInt<double> { static const int test = 0; };
// Dialing up the block size to, say, 1024, can improve performance by
// increasing the amount of cache available per block, which can improve cache hit rate.
// However, this is less efficient for short rows. 256 is pretty versatile.
// Implement some heuristics later?
// May be worth implementing heuristics later.
#define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel.
......@@ -160,15 +160,13 @@ __global__ void weight_norm_fwd_last_dim_kernel
reduce_block_into_lanes(s, thread_sum, blockDim.x);
// Better to pass an EpilogueOp to reduce_block_into_lanes, can try later
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0)
{
float result = s[threadIdx.x];
float norm_this_col = sqrtf(result);
DEVICE_LINEAR_GET_F(norms, fast_dim_location) = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
// printf("blockIdx.x = %d, threadIdx.x = %d, norm_this_col = %f\n",
// blockIdx.x, threadIdx.x, norm_this_col);
}
__syncthreads();
......@@ -373,7 +371,6 @@ void send_to_fwd_wrapper::call
#endif
}
// template <typename T, typename IndexType>
template<typename DataType,
typename AccumType,
typename IndexType>
......@@ -387,7 +384,6 @@ void send_to_bwd_wrapper::call
cout << "Hello from send_to_bwd with pLpw.type = " << pLpw.type << endl;
#endif
// this feels sinful
auto pLpv (*((TensorInfo<DataType , idxType>*)&tensors[0]));
auto pLpg (*((TensorInfo<DataType , idxType>*)&tensors[1]));
auto pLpw (*((TensorInfo<DataType , idxType>*)&tensors[2]));
......
......@@ -20,8 +20,8 @@ For Pytorch users, Real Examples in particular is recommended.
.. automodule:: apex.fp16_utils
.. currentmodule:: apex.fp16_utils
.. FusedNorm
----------
Manual master parameter management
----------------------------------
.. autofunction:: prep_param_lists
......@@ -29,22 +29,21 @@ For Pytorch users, Real Examples in particular is recommended.
.. autofunction:: model_grads_to_master_grads
.. autoclass:: FP16_Optimizer
:members:
Automatic management of master params + loss scaling
----------------------------------------------------
.. autoclass:: Fused_Weight_Norm
.. autoclass:: FP16_Optimizer
:members:
.. .. automodule:: apex.fp16_utils.loss_scaler
.. autoclass:: LossScaler
:members:
.. autoclass:: DynamicLossScaler
:members:
.. .. automodule:: apex.fp16_utils.fp16util
:members:
Custom Operations
-----------------
.. autoclass:: Fused_Weight_Norm
:members:
......@@ -8,7 +8,7 @@
APEx (A PyTorch Extension)
===================================
This is a repo is designed to hold PyTorch modules and utilities that are under active development and experimental. This repo is not designed as a long term solution or a production solution. Things placed in here are intended to be eventually moved to upstream PyTorch.
This is a repo designed to hold PyTorch modules and utilities that are under active development and experimental. This repo is not designed as a long term solution or a production solution. Things placed in here are intended to be eventually moved to upstream PyTorch.
A major focus of this extension is the training of neural networks using 16-bit precision floating point math, which offers significant performance benefits on the latest NVIDIA GPU architectures. The reduced dynamic range of half precision, however, makes it more vulnerable to numerical overflow/underflow.
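As a quick illustration of that dynamic-range limitation (fp16 tops out at 65504, and its smallest positive subnormal is roughly 6e-8):

import torch
big  = torch.cuda.HalfTensor([70000.0])   # above the fp16 max -> inf
tiny = torch.cuda.HalfTensor([1e-8])      # below the smallest fp16 subnormal -> 0
print(big, tiny)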
......
......@@ -8,8 +8,7 @@
// Maximum number of dimensions allowed for cutorch
#define MAX_CUTORCH_DIMS 10
// Warning string for tensor arguments that are too large or have too
// many dimensions
// Warning string for tensor arguments that are too large or have too many dimensions
#define CUTORCH_STR(X) #X
#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
......
......@@ -3,20 +3,12 @@
#include <cstdio>
#include <cassert>
#include <cuda.h>
// this is suboptimal, try forward declarations later
#include <vector>
#define Dims -2
#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<T, IndexType, Dims>::get(INDEX, D_TENSOR)]
#define DEVICE_LINEAR_GET_F(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<float, IndexType, Dims>::get(INDEX, D_TENSOR)]
// template <typename T, typename IndexType>
// void send_to_kernel(
// TensorInfo<T, IndexType> Input_1,
// TensorInfo<T, IndexType> Input_2,
// IndexType totalElems
// );
typedef int idxType;
struct send_to_fwd_wrapper
......
import torch
from torch.autograd import Variable
# import apex
import numpy as np
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
rows = 3
cols = 20
dims = rows, cols
# Incoming gradient vectors we will use later
# Need to create the fp16 versions as a half() copy of a Tensor first rather than
# a Variable, because if you create pt_input_control as a Variable then say
# pt_input_fp16 = pt_input_control.half(), you are accidentally making pt_input_fp16 part of
# pt_input_control's computational graph, so it will not be a leaf!
pt_input_control = Variable(torch.randn(*dims).cuda(), requires_grad=True)
# pt_input_control = torch.ones(*dims).cuda()
pt_input_fp16 = pt_input_control.half()
pt_output_fp16 = pt_input_fp16.sum()
pt_output_control = pt_input_control.sum()
print("After sum()s, before backwards:")
print("pt_output_control.requires_grad = ", pt_output_control.requires_grad)
print("pt_output_control.volatile = ", pt_output_control.volatile)
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
pt_output_fp16.backward() # pt_input_fp16 is not the leaf of this graph, pt_input_control is.
print("After pt_output_fp16.backward():")
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
pt_output_control.backward() # Both backward() calls have pt_input_control as their leaf, and so
# will accumulate gradients into pt_input_control.grad.
print("After pt_output_control.backward():")
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
print("pt_output_control = ", pt_output_control)
print("pt_output_fp16 = ", pt_output_fp16)
......@@ -32,8 +32,8 @@ RAND = True # If false, input gradients (the result of the backward pass)
# where gradients are computed by calling backward() on a scalar Loss.
if RAND:
# With std=6.0, I observe the pytorch fp16 ops going unstable
# while the fused kernel remains stable (sometimes).
# With std=6.0, I observe the pytorch fp16 ops going unstable (sometimes)
# while the fused kernel remains stable.
pt_in_fp32 = torch.cuda.FloatTensor(*dims ).normal_(std=1.0)
norm_shape = get_norm_shape(pt_in_fp32, dim)
pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).normal_(std=1.0)
......
......@@ -18,8 +18,8 @@ sizes = [
# (3, 1024, 2048),
# (1, 1024, 4096),
# (1, 2048, 8192),
# (1, 4096, 4096), # this is not one of natalia's sizes, just a reference benchmark.
# (4096, 4096, 1), # this is not one of natalia's sizes, just a reference benchmark.
# (1, 4096, 4096), # this is not one of the fairseq sizes, just a reference benchmark.
# (4096, 4096, 1), # this is not one of the fairseq sizes, just a reference benchmark.
]
# rows = 3
......@@ -37,7 +37,7 @@ for rows, cols, fast in sizes:
# pt_input_fp16 = pt_input_control.half(), you are accidentally making pt_input_fp16 part of
# pLpOutput_control's computational graph, instead of the leaf of its own separate graph.
# Careful: if you initialize with torch.ones, the gradient wrt input becomes analytically zero :P
# Careful: if you initialize with torch.ones, the gradient wrt input becomes analytically zero.
if RAND:
pLpOutput_control = torch.cuda.FloatTensor(*dims ).uniform_()*1.0
norm_shape = get_norm_shape(pLpOutput_control, dim)
......
......@@ -19,9 +19,9 @@ sizes = [
# (3, 1024, 2048),
# (1, 1024, 4096),
# (1, 2048, 8192),
# (1, 4096, 4096), # this is not one of natalia's sizes, just a reference benchmark.
(4096, 4096, 1), # this is not one of natalia's sizes, just a reference benchmark.
# (353, 55, 353), # this is not one of natalia's sizes, just a reference benchmark.
# (1, 4096, 4096), # this is not one of the fairseq sizes, just a reference benchmark.
(4096, 4096, 1), # this is not one of the fairseq sizes, just a reference benchmark.
# (353, 55, 353), # this is not one of the fairseq sizes, just a reference benchmark.
]
# rows = 3
......