Commit a3e2776a authored by Michael Carilli

Cleaned comments in fp16_utils and csrc. Keeping comments that are non-docstring but informative.

parent a288d585
...@@ -204,7 +204,7 @@ class FP16_Optimizer(object): ...@@ -204,7 +204,7 @@ class FP16_Optimizer(object):
# if name in ['state', 'param_groups']: # if name in ['state', 'param_groups']:
# return self.optimizer.__dict__[name], # return self.optimizer.__dict__[name],
# but this would bypass self.optimizer's custom getters and setters, if it chose to define any. # but this would bypass self.optimizer's custom getters and setters, if it chose to define any.
# I could also use properties, as for loss_scale, but I have no idea if properties bypass # I could also use properties, as for loss_scale, but I don't know if properties bypass
# self.optimizer's custom getters and setters. # self.optimizer's custom getters and setters.
if name == 'state': if name == 'state':
return self.optimizer.state return self.optimizer.state
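As a rough illustration of the property-based alternative mentioned in the comment above, a minimal sketch (a hypothetical wrapper, not the actual FP16_Optimizer code) might look like the following; reading self.optimizer.state this way still goes through normal attribute lookup on the wrapped optimizer, unlike indexing its __dict__ directly:

class OptimizerWrapper(object):
    # Hypothetical sketch: forward 'state' and 'param_groups' to the wrapped
    # optimizer via properties instead of __getattr__.
    def __init__(self, optimizer):
        self.optimizer = optimizer

    @property
    def state(self):
        return self.optimizer.state

    @property
    def param_groups(self):
        return self.optimizer.param_groups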
...@@ -225,7 +225,11 @@ class FP16_Optimizer(object): ...@@ -225,7 +225,11 @@ class FP16_Optimizer(object):
""" """
Zero fp32 and fp16 parameter grads. Zero fp32 and fp16 parameter grads.
""" """
# In principle, only the .grad attributes of the model params need to be zeroed,
# because gradients are copied into the FP32 master params. However, we zero
# all gradients owned by the optimizer, just to be safe:
self.optimizer.zero_grad() self.optimizer.zero_grad()
# Zero fp16 gradients owned by the model:
for fp16_group in self.fp16_groups: for fp16_group in self.fp16_groups:
for param in fp16_group: for param in fp16_group:
if param.grad is not None: if param.grad is not None:
...@@ -245,16 +249,14 @@ class FP16_Optimizer(object): ...@@ -245,16 +249,14 @@ class FP16_Optimizer(object):
def _update_scale(self, has_overflow=False): def _update_scale(self, has_overflow=False):
self.loss_scaler.update_scale(has_overflow) self.loss_scaler.update_scale(has_overflow)
# TODO: Register a hook on each variable to do the overflow check, gradient copy + downscale, # To consider: Integrate distributed with this wrapper by registering a hook on each variable
# fp32 allreduce for distributed in a different stream. Debatable which ops should be # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream.
# treated that way, but it'll be fun to play with.
def _model_grads_to_master_grads(self): def _model_grads_to_master_grads(self):
for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)
def _downscale_master(self): def _downscale_master(self):
if self.loss_scale != 1.0: if self.loss_scale != 1.0:
# print("downscaling fp32 gradients")
for group in self.optimizer.param_groups: for group in self.optimizer.param_groups:
for param in group['params']: for param in group['params']:
param.grad.data.mul_(1./self.loss_scale) param.grad.data.mul_(1./self.loss_scale)
...@@ -334,7 +336,7 @@ class FP16_Optimizer(object): ...@@ -334,7 +336,7 @@ class FP16_Optimizer(object):
self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
# At this point, the optimizer's references to the model's fp32 parameters are up to date. # At this point, the optimizer's references to the model's fp32 parameters are up to date.
# The optimizer's hyperparameters and internal buffers are also up to date. # The optimizer's hyperparameters and internal buffers are also up to date.
# However, the fp32 master copies of the model's fp16 params stored by the optimizer are now # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
# out of date. There are two options. # out of date. There are two options.
# 1: Refresh the master params from the model's fp16 params. # 1: Refresh the master params from the model's fp16 params.
# This requires less storage but incurs precision loss. # This requires less storage but incurs precision loss.
...@@ -342,7 +344,7 @@ class FP16_Optimizer(object): ...@@ -342,7 +344,7 @@ class FP16_Optimizer(object):
# We choose option 2. # We choose option 2.
# #
# Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
# of # their associated parameters, because it's possible those buffers might not exist yet in # of their associated parameters, because it's possible those buffers might not exist yet in
# the current optimizer instance. In our case, as long as the current FP16_Optimizer has been # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
# constructed in the same way as the one whose state_dict we are loading, the same master params # constructed in the same way as the one whose state_dict we are loading, the same master params
# are guaranteed to exist, so we can just copy_() from the saved master params. # are guaranteed to exist, so we can just copy_() from the saved master params.
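A minimal sketch of "option 2" as described above, assuming the saved state dict stores the fp32 master groups under a key like 'fp32_from_fp16' (the key name and exact structure here are assumptions for illustration):

# Hedged sketch: element-wise copy of the saved fp32 master params into the
# current master params, preserving the Tensors the optimizer already references.
for current_group, saved_group in zip(self.fp32_from_fp16_groups,
                                      state_dict['fp32_from_fp16']):
    for current, saved in zip(current_group, saved_group):
        current.data.copy_(saved.data)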
...@@ -423,12 +425,11 @@ class FP16_Optimizer(object): ...@@ -423,12 +425,11 @@ class FP16_Optimizer(object):
self._master_params_to_model_params() self._master_params_to_model_params()
# Our API expects the user to give us ownership of the backward() call by # Our API expects the user to give us ownership of the backward() call by
# replacing all calls to loss.backward() with optimizer.backward(loss). # replacing all calls to loss.backward() with optimizer.backward(loss).
# This requirement holds whether or not the call to backward() is made within # This requirement holds whether or not the call to backward() is made within a closure.
# a closure.
# If the user is properly calling optimizer.backward(loss) within "closure," # If the user is properly calling optimizer.backward(loss) within "closure,"
# calling closure() here will give the fp32 master params fresh gradients # calling closure() here will give the fp32 master params fresh gradients
# for the optimizer to play with, # for the optimizer to play with, so all wrapped_closure needs to do is call
# so all wrapped_closure needs to do is call closure() and return the loss. # closure() and return the loss.
temp_loss = closure() temp_loss = closure()
return temp_loss return temp_loss
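To illustrate the ownership-of-backward contract described above, a hedged usage sketch (model, loss_fn, inputs, and targets are placeholder names, not part of this file):

# Hypothetical usage: the user replaces loss.backward() with optimizer.backward(loss),
# both with and without a closure.
def closure():
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    optimizer.backward(loss)   # instead of loss.backward()
    return loss

loss = optimizer.step(closure)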
...@@ -496,11 +497,10 @@ class FP16_Optimizer(object): ...@@ -496,11 +497,10 @@ class FP16_Optimizer(object):
optimizer.backward(loss2, update_master_grads=False) optimizer.backward(loss2, update_master_grads=False)
optimizer.update_master_grads() optimizer.update_master_grads()
""" """
# To think about: try multiple backward passes using retain_grad=True to find # To consider: try multiple backward passes using retain_grad=True to find
# a loss scale that works. After you find a loss scale that works, do a final dummy # a loss scale that works. After you find a loss scale that works, do a final dummy
# backward pass with retain_graph=False to tear down the graph. # backward pass with retain_graph=False to tear down the graph. Doing this would avoid
# Doing this would avoid discarding the iteration, but probably wouldn't # discarding the iteration, but probably wouldn't improve overall efficiency.
# improve overall efficiency.
self.loss_scaler.backward(loss.float()) self.loss_scaler.backward(loss.float())
if update_master_grads: if update_master_grads:
self.update_master_grads() self.update_master_grads()
......
...@@ -45,7 +45,7 @@ def backwards_debug_hook(grad): ...@@ -45,7 +45,7 @@ def backwards_debug_hook(grad):
raise RuntimeError("master_params recieved a gradient in the backward pass!") raise RuntimeError("master_params recieved a gradient in the backward pass!")
def prep_param_lists(model, flat_master=False): def prep_param_lists(model, flat_master=False):
r""" """
Creates a list of FP32 master parameters for a given model, as in Creates a list of FP32 master parameters for a given model, as in
`Training Neural Networks with Mixed Precision: Real Examples`_. `Training Neural Networks with Mixed Precision: Real Examples`_.
......
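For context, a hedged sketch of the manual master-parameter workflow these helpers support (the training-loop details are placeholders; model_grads_to_master_grads and prep_param_lists appear in this file, and master_params_to_model_params is assumed to be the companion helper):

# Sketch of manual master-param management, assuming an fp16 model and an
# optimizer constructed over the fp32 master params from prep_param_lists.
import torch

model_params, master_params = prep_param_lists(model, flat_master=False)
optimizer = torch.optim.SGD(master_params, lr=0.01)

# ... fp16 forward/backward populates .grad on model_params ...
model_grads_to_master_grads(model_params, master_params)
optimizer.step()                                             # update happens in fp32
master_params_to_model_params(model_params, master_params)   # copy back to fp16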
...@@ -42,7 +42,7 @@ class Fused_Weight_Norm(Function): ...@@ -42,7 +42,7 @@ class Fused_Weight_Norm(Function):
output = input.new(input.size()).contiguous() output = input.new(input.size()).contiguous()
""" """
For output with size (slow, faster, faster, ...fastest), we may want For output with size (slow, faster, faster, ...fastest), we want
norms with size (slow, 1, 1, ...1), so that if you want to retrieve norms norms with size (slow, 1, 1, ...1), so that if you want to retrieve norms
and apply the same normalizing factors to another Tensor "t" with the and apply the same normalizing factors to another Tensor "t" with the
same size as output, "t/norms" will broadcast each element of norms same size as output, "t/norms" will broadcast each element of norms
...@@ -88,7 +88,7 @@ class Fused_Weight_Norm(Function): ...@@ -88,7 +88,7 @@ class Fused_Weight_Norm(Function):
savedInput, savedg = ctx.saved_tensors savedInput, savedg = ctx.saved_tensors
savedNorms = ctx.norms savedNorms = ctx.norms
# better safe than sorry # We expect that these .contiguous() calls will be no-ops. They're present for safety.
grad_output_contig = grad_output.contiguous() grad_output_contig = grad_output.contiguous()
grad_input = grad_output_contig.new(grad_output.size()).contiguous() grad_input = grad_output_contig.new(grad_output.size()).contiguous()
......
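A small, hedged illustration of the broadcasting behavior described in the docstring earlier in this file (shapes chosen arbitrarily):

import torch

t = torch.randn(4, 8, 16)                              # size (slow, faster, fastest)
norms = t.view(4, -1).norm(p=2, dim=1).view(4, 1, 1)   # size (slow, 1, 1)
normalized = t / norms                                 # each norm divides a whole (8, 16) slice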
"""
Top of loss_scaler.py stub. Can't figure out a way to get the module file
highlighted in a pretty way, or link back to source.
"""
import torch import torch
# item() is a recent addition, so this helps with backward compatibility. # item() is a recent addition, so this helps with backward compatibility.
...@@ -18,6 +14,9 @@ class LossScaler: ...@@ -18,6 +14,9 @@ class LossScaler:
Use of LossScaler is enabled via the ``static_loss_scale`` argument to Use of LossScaler is enabled via the ``static_loss_scale`` argument to
:class:`FP16_Optimizer`'s constructor. :class:`FP16_Optimizer`'s constructor.
Args:
scale (float, optional, default=1.0): The loss scale.
""" """
def __init__(self, scale=1): def __init__(self, scale=1):
...@@ -31,7 +30,6 @@ class LossScaler: ...@@ -31,7 +30,6 @@ class LossScaler:
def _has_inf_or_nan(x): def _has_inf_or_nan(x):
return False return False
# `overflow` is boolean indicating whether we overflowed in gradient
def update_scale(self, overflow): def update_scale(self, overflow):
pass pass
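A hedged sketch of how a static loss scale is typically applied around backward() (simplified, with placeholder names; not the exact code in this class):

# Simplified static loss scaling: scale the loss so fp16 gradients stay
# representable, then unscale the fp32 master grads before the optimizer step.
scaled_loss = loss.float() * loss_scale
scaled_loss.backward()
for param in master_params:
    if param.grad is not None:
        param.grad.data.mul_(1.0 / loss_scale)
optimizer.step()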
...@@ -92,10 +90,12 @@ class DynamicLossScaler: ...@@ -92,10 +90,12 @@ class DynamicLossScaler:
# `x` is a torch.Tensor # `x` is a torch.Tensor
def _has_inf_or_nan(x): def _has_inf_or_nan(x):
try: try:
# Stopgap until upstream fixes sum() on HalfTensors # if x is half, the .float() incurs an additional deep copy, but it's necessary if
# Pytorch's .sum() creates a one-element tensor of the same type as x
# (which is true for some recent versions of pytorch).
cpu_sum = float(x.float().sum()) cpu_sum = float(x.float().sum())
# More efficient version that can be used if .sum() returns a Python scalar
# cpu_sum = float(x.sum()) # cpu_sum = float(x.sum())
# print(cpu_sum)
except RuntimeError as instance: except RuntimeError as instance:
# We want to check if inst is actually an overflow exception. # We want to check if inst is actually an overflow exception.
# RuntimeError could come from a different error. # RuntimeError could come from a different error.
...@@ -108,7 +108,7 @@ class DynamicLossScaler: ...@@ -108,7 +108,7 @@ class DynamicLossScaler:
return True return True
return False return False
# `overflow` is boolean indicating whether we overflowed in gradient # `overflow` is boolean indicating whether the gradient overflowed
def update_scale(self, overflow): def update_scale(self, overflow):
if overflow: if overflow:
# self.cur_scale /= self.scale_factor # self.cur_scale /= self.scale_factor
......
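For reference, a hedged sketch of the usual dynamic update rule (shrink the scale on overflow, grow it after a run of overflow-free iterations). Only cur_scale and scale_factor appear in the surrounding code; scale_window, cur_iter, and last_overflow_iter are assumed attributes for illustration:

# Hypothetical sketch of DynamicLossScaler.update_scale; not the exact code.
def update_scale(self, overflow):
    if overflow:
        self.cur_scale = max(self.cur_scale / self.scale_factor, 1)
        self.last_overflow_iter = self.cur_iter
    elif (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
        self.cur_scale *= self.scale_factor
    self.cur_iter += 1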
...@@ -53,7 +53,7 @@ TensorInfo<void, idxType> PyOb_2_tinfo(PyObject* tensor, float_types data_type) ...@@ -53,7 +53,7 @@ TensorInfo<void, idxType> PyOb_2_tinfo(PyObject* tensor, float_types data_type)
void* data_ptr = (void*) PyLong_AsLong(PyDataPtr); void* data_ptr = (void*) PyLong_AsLong(PyDataPtr);
Py_ssize_t ndims = PyList_GET_SIZE(PySizes); Py_ssize_t ndims = PyList_GET_SIZE(PySizes);
//TODO put proper checking on ndims < MAX_CUTORCH_DIMS // TODO put proper checking on ndims < MAX_CUTORCH_DIMS
idxType strides[MAX_CUTORCH_DIMS], sizes[MAX_CUTORCH_DIMS]; idxType strides[MAX_CUTORCH_DIMS], sizes[MAX_CUTORCH_DIMS];
for(int i = 0; i < ndims; i++) for(int i = 0; i < ndims; i++)
...@@ -62,8 +62,6 @@ TensorInfo<void, idxType> PyOb_2_tinfo(PyObject* tensor, float_types data_type) ...@@ -62,8 +62,6 @@ TensorInfo<void, idxType> PyOb_2_tinfo(PyObject* tensor, float_types data_type)
sizes[i] = PyLong_AsLong(PyTuple_GetItem(PySizes, i)); sizes[i] = PyLong_AsLong(PyTuple_GetItem(PySizes, i));
} }
// Reference counts still behave strangely, but at least these appear to cap
// the process' memory usage.
Py_DECREF(PyStrides); Py_DECREF(PyStrides);
Py_DECREF(PySizes); Py_DECREF(PySizes);
Py_DECREF(PyDataPtr); Py_DECREF(PyDataPtr);
...@@ -95,10 +93,6 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args) ...@@ -95,10 +93,6 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
cout << "For args item " << i << ", pyObjTypeCall = NULL" << endl; cout << "For args item " << i << ", pyObjTypeCall = NULL" << endl;
} }
// This gives a segfault:
// cout << "pyObjTypeCall direct conversion attempt = " <<
// PyBytes_AsString(pyObjTypeCall) << endl;
PyObject* pyObjASCII = PyUnicode_AsASCIIString(pyObjTypeCall); PyObject* pyObjASCII = PyUnicode_AsASCIIString(pyObjTypeCall);
if(pyObjASCII == NULL) if(pyObjASCII == NULL)
{ {
...@@ -106,12 +100,10 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args) ...@@ -106,12 +100,10 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
cout << "For args item " << i << ", pyObjASCII = NULL " << endl; cout << "For args item " << i << ", pyObjASCII = NULL " << endl;
} }
// cout << "Py_REFCNT(pyObjTypeCall) = " << Py_REFCNT(pyObjTypeCall) << endl;
Py_DECREF(pyObjTypeCall); Py_DECREF(pyObjTypeCall);
string objTypeCall(PyBytes_AsString(pyObjASCII)); string objTypeCall(PyBytes_AsString(pyObjASCII));
// cout << "Py_REFCNT(pyObjASCII) = " << Py_REFCNT(pyObjASCII) << endl;
Py_DECREF(pyObjASCII); Py_DECREF(pyObjASCII);
#ifdef DEBUG_ANY #ifdef DEBUG_ANY
...@@ -131,7 +123,7 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args) ...@@ -131,7 +123,7 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
#endif #endif
else if(objTypeCall == "torch.cuda.HalfTensor") else if(objTypeCall == "torch.cuda.HalfTensor")
info_vec.push_back(PyOb_2_tinfo(pyTensor, HALF)); info_vec.push_back(PyOb_2_tinfo(pyTensor, HALF));
// Could add double // TODO add double
else else
{ {
ERROR_MSG; ERROR_MSG;
...@@ -148,8 +140,6 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args) ...@@ -148,8 +140,6 @@ vector<TensorInfo<void, idxType> > get_TInfos(PyObject* args)
} }
} }
// PyErr_SetString(PyExc_RuntimeError, "Exception set in ");
return info_vec; return info_vec;
} }
...@@ -185,8 +175,8 @@ void dispatch ...@@ -185,8 +175,8 @@ void dispatch
} }
} }
//Will extract all tensors in order. Assumes flat structure, tensors can not be wrapped in lists // Will extract all tensors in order. Assumes flat structure, tensors can not be wrapped in lists
//tuples or any other iterator structure. // tuples or any other iterator structure.
static PyObject* weight_norm_fwd(PyObject* self, PyObject* args) static PyObject* weight_norm_fwd(PyObject* self, PyObject* args)
{ {
#ifdef USE_NVTX #ifdef USE_NVTX
......
...@@ -22,7 +22,7 @@ template<> struct TtoInt<double> { static const int test = 0; }; ...@@ -22,7 +22,7 @@ template<> struct TtoInt<double> { static const int test = 0; };
// Dialing up the block size to, say 1024, can improve performance by // Dialing up the block size to, say 1024, can improve performance by
// increase the amount of cache available per block, which can improve cache hit rate. // increase the amount of cache available per block, which can improve cache hit rate.
// However, this is less efficient for short rows. 256 is pretty versatile. // However, this is less efficient for short rows. 256 is pretty versatile.
// Implement some heuristics later? // May be worth implementing heuristics later.
#define BLOCK 256 #define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel. // Block size for weight_norm_*_last_dim_kernel.
...@@ -160,15 +160,13 @@ __global__ void weight_norm_fwd_last_dim_kernel ...@@ -160,15 +160,13 @@ __global__ void weight_norm_fwd_last_dim_kernel
reduce_block_into_lanes(s, thread_sum, blockDim.x); reduce_block_into_lanes(s, thread_sum, blockDim.x);
// Better to pass an EpilogueOp to reduce_block_into_lanes, can try later // Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0) if(threadIdx.y == 0)
{ {
float result = s[threadIdx.x]; float result = s[threadIdx.x];
float norm_this_col = sqrtf(result); float norm_this_col = sqrtf(result);
DEVICE_LINEAR_GET_F(norms, fast_dim_location) = norm_this_col; DEVICE_LINEAR_GET_F(norms, fast_dim_location) = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col; rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
// printf("blockIdx.x = %d, threadIdx.x = %d, norm_this_col = %f\n",
// blockIdx.x, threadIdx.x, norm_this_col);
} }
__syncthreads(); __syncthreads();
...@@ -373,7 +371,6 @@ void send_to_fwd_wrapper::call ...@@ -373,7 +371,6 @@ void send_to_fwd_wrapper::call
#endif #endif
} }
// template <typename T, typename IndexType>
template<typename DataType, template<typename DataType,
typename AccumType, typename AccumType,
typename IndexType> typename IndexType>
...@@ -387,7 +384,6 @@ void send_to_bwd_wrapper::call ...@@ -387,7 +384,6 @@ void send_to_bwd_wrapper::call
cout << "Hello from send_to_bwd with pLpw.type = " << pLpw.type << endl; cout << "Hello from send_to_bwd with pLpw.type = " << pLpw.type << endl;
#endif #endif
// this feels sinful
auto pLpv (*((TensorInfo<DataType , idxType>*)&tensors[0])); auto pLpv (*((TensorInfo<DataType , idxType>*)&tensors[0]));
auto pLpg (*((TensorInfo<DataType , idxType>*)&tensors[1])); auto pLpg (*((TensorInfo<DataType , idxType>*)&tensors[1]));
auto pLpw (*((TensorInfo<DataType , idxType>*)&tensors[2])); auto pLpw (*((TensorInfo<DataType , idxType>*)&tensors[2]));
......
...@@ -20,8 +20,8 @@ For Pytorch users, Real Examples in particular is recommended. ...@@ -20,8 +20,8 @@ For Pytorch users, Real Examples in particular is recommended.
.. automodule:: apex.fp16_utils .. automodule:: apex.fp16_utils
.. currentmodule:: apex.fp16_utils .. currentmodule:: apex.fp16_utils
.. FusedNorm Manual master parameter management
---------- ----------------------------------
.. autofunction:: prep_param_lists .. autofunction:: prep_param_lists
...@@ -29,22 +29,21 @@ For Pytorch users, Real Examples in particular is recommended. ...@@ -29,22 +29,21 @@ For Pytorch users, Real Examples in particular is recommended.
.. autofunction:: model_grads_to_master_grads .. autofunction:: model_grads_to_master_grads
.. autoclass:: FP16_Optimizer Automatic management of master params + loss scaling
:members: ----------------------------------------------------
.. autoclass:: Fused_Weight_Norm .. autoclass:: FP16_Optimizer
:members: :members:
.. .. automodule:: apex.fp16_utils.loss_scaler
.. autoclass:: LossScaler .. autoclass:: LossScaler
:members: :members:
.. autoclass:: DynamicLossScaler .. autoclass:: DynamicLossScaler
:members: :members:
.. .. automodule:: apex.fp16_utils.fp16util
:members:
Custom Operations
-----------------
.. autoclass:: Fused_Weight_Norm
:members:
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
APEx (A PyTorch Extension) APEx (A PyTorch Extension)
=================================== ===================================
This is a repo is designed to hold PyTorch modules and utilities that are under active development and experimental. This repo is not designed as a long term solution or a production solution. Things placed in here are intended to be eventually moved to upstream PyTorch. This is a repo designed to hold PyTorch modules and utilities that are experimental and under active development. This repo is not designed as a long-term or production solution. Things placed here are intended to eventually be moved upstream into PyTorch.
A major focus of this extension is the training of neural networks using 16-bit precision floating point math, which offers significant performance benefits on the latest NVIDIA GPU architectures. Half precision's reduced dynamic range, however, makes it more vulnerable to numerical overflow/underflow. A major focus of this extension is the training of neural networks using 16-bit precision floating point math, which offers significant performance benefits on the latest NVIDIA GPU architectures. Half precision's reduced dynamic range, however, makes it more vulnerable to numerical overflow/underflow.
......
...@@ -8,8 +8,7 @@ ...@@ -8,8 +8,7 @@
// Maximum number of dimensions allowed for cutorch // Maximum number of dimensions allowed for cutorch
#define MAX_CUTORCH_DIMS 10 #define MAX_CUTORCH_DIMS 10
// Warning string for tensor arguments that are too large or have too // Warning string for tensor arguments that are too large or have too many dimensions
// many dimensions
#define CUTORCH_STR(X) #X #define CUTORCH_STR(X) #X
#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \ #define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions" CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
......
...@@ -3,20 +3,12 @@ ...@@ -3,20 +3,12 @@
#include <cstdio> #include <cstdio>
#include <cassert> #include <cassert>
#include <cuda.h> #include <cuda.h>
// this is suboptimal, try forward declarations later
#include <vector> #include <vector>
#define Dims -2 #define Dims -2
#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<T, IndexType, Dims>::get(INDEX, D_TENSOR)] #define DEVICE_LINEAR_GET(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<T, IndexType, Dims>::get(INDEX, D_TENSOR)]
#define DEVICE_LINEAR_GET_F(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<float, IndexType, Dims>::get(INDEX, D_TENSOR)] #define DEVICE_LINEAR_GET_F(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<float, IndexType, Dims>::get(INDEX, D_TENSOR)]
// template <typename T, typename IndexType>
// void send_to_kernel(
// TensorInfo<T, IndexType> Input_1,
// TensorInfo<T, IndexType> Input_2,
// IndexType totalElems
// );
typedef int idxType; typedef int idxType;
struct send_to_fwd_wrapper struct send_to_fwd_wrapper
......
import torch
from torch.autograd import Variable
# import apex
import numpy as np
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
rows = 3
cols = 20
dims = rows, cols
# Incoming gradient vectors we will use later
# Need to create the fp16 versions as a half() copy of a Tensor first rather than
# a Variable, because if you create pt_input_control as a Variable then say
# pt_input_fp16 = pt_input_control.half(), you are accidentally making pt_input_fp16 part of
# pt_input_control's computational graph, so it will not be a leaf!
pt_input_control = Variable(torch.randn(*dims).cuda(), requires_grad=True)
# pt_input_control = torch.ones(*dims).cuda()
pt_input_fp16 = pt_input_control.half()
pt_output_fp16 = pt_input_fp16.sum()
pt_output_control = pt_input_control.sum()
print("After sum()s, before backwards:")
print("pt_output_control.requires_grad = ", pt_output_control.requires_grad)
print("pt_output_control.volatile = ", pt_output_control.volatile)
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
pt_output_fp16.backward() # pt_input_fp16 is not the leaf of this graph, pt_input_control is.
print("After pt_output_fp16.backward():")
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
pt_output_control.backward() # Both backward() calls have pt_input_control as leaves, and so
# will accumulate gradients into pt_input_control.grad
print("After pt_output_control.backward():")
print("pt_input_control.grad = ", pt_input_control.grad)
print("pt_input_fp16.grad = ", pt_input_fp16.grad)
print("\n\n")
print("pt_output_control = ", pt_output_control)
print("pt_output_fp16 = ", pt_output_fp16)
...@@ -32,8 +32,8 @@ RAND = True # If false, input gradients (the result of the backward pass) ...@@ -32,8 +32,8 @@ RAND = True # If false, input gradients (the result of the backward pass)
# where gradients are computed by calling backward() on a scalar Loss. # where gradients are computed by calling backward() on a scalar Loss.
if RAND: if RAND:
# With std=6.0, I observe the pytorch fp16 ops going unstable # With std=6.0, I observe the pytorch fp16 ops going unstable (sometimes)
# while the fused kernel remains stable (sometimes). # while the fused kernel remains stable.
pt_in_fp32 = torch.cuda.FloatTensor(*dims ).normal_(std=1.0) pt_in_fp32 = torch.cuda.FloatTensor(*dims ).normal_(std=1.0)
norm_shape = get_norm_shape(pt_in_fp32, dim) norm_shape = get_norm_shape(pt_in_fp32, dim)
pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).normal_(std=1.0) pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).normal_(std=1.0)
......
...@@ -18,8 +18,8 @@ sizes = [ ...@@ -18,8 +18,8 @@ sizes = [
# (3, 1024, 2048), # (3, 1024, 2048),
# (1, 1024, 4096), # (1, 1024, 4096),
# (1, 2048, 8192), # (1, 2048, 8192),
# (1, 4096, 4096), # this is not one of natalia's sizes, just a reference benchmark. # (1, 4096, 4096), # this is not one of the fairseq sizes, just a reference benchmark.
# (4096, 4096, 1), # this is not one of natalia's sizes, just a reference benchmark. # (4096, 4096, 1), # this is not one of the fairseq sizes, just a reference benchmark.
] ]
# rows = 3 # rows = 3
...@@ -37,7 +37,7 @@ for rows, cols, fast in sizes: ...@@ -37,7 +37,7 @@ for rows, cols, fast in sizes:
# pt_input_fp16 = pt_input_control.half(), you are accidentally making pt_input_fp16 part of # pt_input_fp16 = pt_input_control.half(), you are accidentally making pt_input_fp16 part of
# pLpOutput_control's computational graph, instead of the leaf of its own separate graph. # pLpOutput_control's computational graph, instead of the leaf of its own separate graph.
# Careful: if you initialize with torch.ones, the gradient wrt input becomes analytically zero :P # Careful: if you initialize with torch.ones, the gradient wrt input becomes analytically zero.
if RAND: if RAND:
pLpOutput_control = torch.cuda.FloatTensor(*dims ).uniform_()*1.0 pLpOutput_control = torch.cuda.FloatTensor(*dims ).uniform_()*1.0
norm_shape = get_norm_shape(pLpOutput_control, dim) norm_shape = get_norm_shape(pLpOutput_control, dim)
......
...@@ -19,9 +19,9 @@ sizes = [ ...@@ -19,9 +19,9 @@ sizes = [
# (3, 1024, 2048), # (3, 1024, 2048),
# (1, 1024, 4096), # (1, 1024, 4096),
# (1, 2048, 8192), # (1, 2048, 8192),
# (1, 4096, 4096), # this is not one of natalia's sizes, just a reference benchmark. # (1, 4096, 4096), # this is not one of the fairseq sizes, just a reference benchmark.
(4096, 4096, 1), # this is not one of natalia's sizes, just a reference benchmark. (4096, 4096, 1), # this is not one of the fairseq sizes, just a reference benchmark.
# (353, 55, 353), # this is not one of natalia's sizes, just a reference benchmark. # (353, 55, 353), # this is not one of the fairseq sizes, just a reference benchmark.
] ]
# rows = 3 # rows = 3
......