Commit d695b68b authored by Michael Carilli

Switch to simple Python-only install, in preparation for upstreaming C++ backend.

parent aa817132
import torch
from apex_C import scale_check_overflow
# from apex_C import scale_check_overflow
# Python stopgap, until we get a future-proof kernel into upstream
def scale_check_overflow(d_grads, scale):
# Exception handling for 18.04 compatibility
try:
cpu_sum = float(d_grads.float().sum())
except RuntimeError as instance:
if "value cannot be converted" not in instance.args[0]:
raise
return True
else:
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
return True
d_grads.mul_(scale)
return False
class LossScaler(object):
def __init__(self):
@@ -8,20 +23,23 @@ class LossScaler(object):
self._max_loss_scale = 2.**24
self._scale_seq_len = 2000
self._unskipped = 0
self._overflow_buf = torch.cuda.ByteTensor(1024,)
self._has_overflow = False
# self._overflow_buf = torch.cuda.ByteTensor(1024,)
def loss_scale(self):
return self._loss_scale
def unscale_and_update(self, param_groups, scale):
self._overflow_buf.zero_()
# self._overflow_buf.zero_()
self._has_overflow = False
for p in iter_params(param_groups):
if p.grad is not None:
scale_check_overflow(p.grad.data,
1. / scale,
self._overflow_buf)
self._has_overflow = scale_check_overflow(p.grad.data,
1. / scale)
if self._has_overflow: break
if self._overflow_buf.any():
# if self._overflow_buf.any():
if self._has_overflow:
should_skip = True
self._loss_scale /= 2.
self._unskipped = 0
......
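For reference, the Python stopgap above can be exercised on its own roughly as follows. This is a minimal sketch (the variable names are illustrative, not part of the diff), assuming the scale_check_overflow defined above is in scope:

import torch

# Hypothetical fp32 master gradient and a dynamic loss scale of 128.
grad = torch.randn(1024)
loss_scale = 128.0
# Returns True on overflow; otherwise multiplies grad in place by 1/loss_scale.
overflow = scale_check_overflow(grad, 1. / loss_scale)
if overflow:
    # LossScaler would skip the optimizer step and halve the loss scale here.
    pass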
import torch
from torch.autograd import Variable
from torch.autograd.function import Function, once_differentiable
import apex_C
def check_contig_cuda(tensors, names):
for tensor, name in zip(tensors, names):
if not tensor.is_contiguous():
raise RuntimeError(name+" with size {} is not contiguous"
.format(tensor.size()))
if not tensor.is_cuda:
raise RuntimeError(name+".is_cuda = False."
"Currently, only cuda tensors are supported.")
class Fused_Weight_Norm(Function):
"""
Custom autograd function that implements weight norm, as presented in
`<https://arxiv.org/abs/1602.07868>`_,
along a tensor's slowest or
fastest dimension using fused kernel launches for the forward and backward passes.
Accepts fp32 or fp16 input; the output type will match the input type.
Within the kernels, all calculations are performed in fp32 for numerical stability, regardless
of input/output precision.
We are refactoring our fused kernels to add to Pytorch core, so that Pytorch's built-in weightnorm
will use them transparently. Please use Pytorch's built-in weightnorm implementation for now, to
future-proof your code.
"""
@staticmethod
def forward(ctx, input, g, dim=0):
"""
Args:
input(torch.cuda.FloatTensor or torch.cuda.HalfTensor): input tensor corresponding to **v** in the paper. ``input`` should be contiguous.
g(torch.cuda.FloatTensor or torch.cuda.HalfTensor): input tensor corresponding to **g** in the paper. ``g`` should be the same type as ``input``.
dim(int, optional, default=0): Dimension across which to perform weightnorm. Currently, only the first or last dimension of the input tensor is supported.
Returns:
Output tensor corresponding to **w** in the paper. Output type and precision will match
type and precision of ``input``.
"""
# torch.cuda.nvtx.range_push("FusedNorm.forward, input.size() = {}"
# .format(input.size()))
check_contig_cuda((input,g),("input","g"))
"""
This is ok, new() treats a torch.Size object properly.
No need to unpack with an asterisk via new(*input.size()).
"""
output = input.new(input.size()).contiguous()
"""
For output with size (slow, faster, faster, ...fastest), we want
norms with size (slow, 1, 1, ...1), so that if you want to retrieve norms
and apply the same normalizing factors to another Tensor "t" with the
same size as output, "t/norms" will broadcast each element of norms
across the corresponding slowest dim of t.
"""
if dim == 0:
norm_size = (output.size(0),) + (1,)*(output.dim() - 1)
elif dim == output.dim() - 1:
norm_size = (1,)*(output.dim() - 1) + (output.size(-1),)
else:
raise RuntimeError("Currently, Fused_Weight_Norm only supports first or last dimension.")
norms = torch.cuda.FloatTensor(*norm_size).contiguous()
"""
Beware: If you call the following:
norms = torch.cuda.FloatTensor(norm_size).contiguous()
the constructor sees a tuple:
FloatTensor( (output.size(0),1,1,...) )
and creates a 1D tensor with values from the tuple:
[output.size(0),1,1,...].
"""
apex_C.weight_norm_fwd(output, norms, input, g, dim)
ctx.save_for_backward(input, g)
# save_for_backward can only save input or output tensors,
# use ctx state to save the norms and dimension:
ctx.norms = norms
ctx.dim = dim
return output
def forward(ctx):
raise NotImplementedError("Use Pytorch's built-in weightnorm implementation. "+
"We are in the process of adding our fused kernels to Pytorch core, "+
"so Pytorch's built-in weightnorm will use them transparently.")
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
"""
Args:
grad_output(torch.cuda.FloatTensor or torch.cuda.HalfTensor): Gradient of loss with respect to output **w**. ``grad_output`` should be contiguous for performance.
Returns:
Gradient of loss with respect to ``input`` and ``g``. The precision of these gradients will match the precision of ``grad_output``.
"""
check_contig_cuda((grad_output,), ("grad_output",))
savedInput, savedg = ctx.saved_tensors
savedNorms = ctx.norms
# We expect that these .contiguous() calls will be no-ops. They're present for safety.
grad_output_contig = grad_output.contiguous()
grad_input = grad_output_contig.new(grad_output.size()).contiguous()
grad_g = savedg.new(savedg.size()).contiguous()
apex_C.weight_norm_bwd(grad_input,
grad_g,
grad_output_contig,
savedInput,
savedg,
savedNorms,
ctx.dim)
return grad_input, grad_g, None
raise NotImplementedError("Use Pytorch's built-in weightnorm implementation. "+
"We are in the process of adding our fused kernels to Pytorch core, "+
"so Pytorch's built-in weightnorm will use them transparently.")
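For orientation, a minimal sketch of how the custom function above is invoked, mirroring the test scripts later in this commit. The sizes are illustrative, and it assumes the fused apex_C backend (pre-this-commit) is still built:

import torch
from torch.autograd import Variable
from apex.fp16_utils import Fused_Weight_Norm   # import path used by the tests below

v = Variable(torch.cuda.FloatTensor(64, 256).normal_(), requires_grad=True)
g = Variable(torch.cuda.FloatTensor(64, 1).normal_(), requires_grad=True)
# dim=0: normalize each slowest-dim row of v, then scale by g.
w = Fused_Weight_Norm.apply(v, g, 0)
w.sum().backward()   # populates v.grad and g.grad via the fused backward kernel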
#pragma once
#include <ATen/Half.h>
#include <ATen/cuda/CUDAHalf.cuh>
// Type traits to convert types to CUDA-specific types. Used primarily to
// convert at::Half to CUDA's half type. This makes the conversion explicit.
// Disambiguate from whatever is in aten
namespace apex { namespace cuda {
template <typename T>
struct TypeConversion {
using type = T;
};
template <>
struct TypeConversion<at::Half> {
using type = half;
};
template <typename T>
using type = typename TypeConversion<T>::type;
}} // namespace apex::cuda
#include <torch/torch.h>
// Ideally, I'd like to call this file "weight_norm.cu" and put the interface and the implementation
// here, but I can't make nvcc play well with torch.h. For now, use a layer of indirection
// and separate .cu implementation files.
// If we want everything to be part of "apex_C", we need all the interface functions defined
// in this file, or the linker will complain about "multiple definitions of PyInit".
// TODO: multiple modules?
// TODO: modify fwd+bwd calls to return a tuple of Tensors. This will require changing the
// Python client code as well. For now, get things working with the same Python-side API.
void weight_norm_fwd_cuda
(const at::Tensor& w,
const at::Tensor& norms,
const at::Tensor& v,
const at::Tensor& g,
int dim);
void weight_norm_fwd
(at::Tensor w,
at::Tensor norms,
at::Tensor v,
at::Tensor g,
int dim)
{
weight_norm_fwd_cuda(w, norms, v, g, dim);
}
void weight_norm_bwd_cuda
(const at::Tensor& pLpv,
const at::Tensor& pLpg,
const at::Tensor& pLpw,
const at::Tensor& savedv,
const at::Tensor& savedg,
const at::Tensor& savedNorms,
int dim);
void weight_norm_bwd
(at::Tensor pLpv,
at::Tensor pLpg,
at::Tensor pLpw,
at::Tensor savedv,
at::Tensor savedg,
at::Tensor savedNorms,
int dim)
{
weight_norm_bwd_cuda(pLpv, pLpg, pLpw, savedv, savedg, savedNorms, dim);
}
void scale_check_overflow_cuda
(const at::Tensor& d_grads,
float scale,
const at::Tensor& d_buf);
#ifdef VERSION_LE_04
#define VERSION_AGNOSTIC_CHECK AT_ASSERT
#else
#define VERSION_AGNOSTIC_CHECK AT_CHECK
#endif
void scale_check_overflow
(at::Tensor grads,
float scale,
at::Tensor overflow_buf)
{
VERSION_AGNOSTIC_CHECK
(grads.type().is_cuda(), "grads must be a CUDA tensor");
VERSION_AGNOSTIC_CHECK
(overflow_buf.type().is_cuda(), "overflow_buf must be a CUDA tensor");
// Make sure we are downscaling the FP32 master grads
VERSION_AGNOSTIC_CHECK
(grads.type().scalarType() == at::ScalarType::Float,
"grads supplied to scale_check_overflow should be fp32 (master grads).")
scale_check_overflow_cuda(grads, scale, overflow_buf);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("weight_norm_fwd", &weight_norm_fwd, "Fused weight norm, forward pass");
m.def("weight_norm_bwd", &weight_norm_bwd, "Fused weight norm, backward pass");
m.def("scale_check_overflow", &scale_check_overflow, "Fused overflow check + scale for FP32 tensors");
}
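For context, the scale_check_overflow binding above was driven from Python by the old LossScaler code roughly as follows. A sketch only, assuming the apex_C extension is built:

import torch
import apex_C

grads = torch.cuda.FloatTensor(1 << 20).normal_()    # fp32 master gradients
overflow_buf = torch.cuda.ByteTensor(1024).zero_()   # one flag slot per launched block
apex_C.scale_check_overflow(grads, 1. / 128.0, overflow_buf)
if overflow_buf.any():
    # At least one gradient element was Inf/NaN; skip this optimizer step.
    pass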
#include <cuda.h>
#include <cuda_runtime.h>
// Lock in a local version of CUDATypeConversion.cuh
#include "CUDATypeConversion.cuh"
#include <THC/THCNumerics.cuh>
#if __CUDACC_VER_MAJOR__ >= 9
#define __SHFL_DOWN(var, delta) __shfl_down_sync(0xffffffff, var, delta)
#else
#define __SHFL_DOWN(var, delta) __shfl_down(var, delta)
#endif
#if __CUDACC_VER_MAJOR__ >= 9
#define __SYNCWARP __syncwarp()
#else
#define __SYNCWARP
#endif
// Not a long-term solution; this code needs to go upstream.
#ifdef VERSION_LE_04
#define USING_ACCSCALAR_T using accscalar_t = cuda::acc_type<cuda_scalar_t>;
#else
#define USING_ACCSCALAR_T using accscalar_t = acc_type<cuda_scalar_t, true>;
#endif
// Block size for weight_norm_*_first_dim_kernel.
// Currently, kernels are non-persistent.
// Dialing up the block size to, say, 1024 can improve performance by
// increasing the amount of cache available per block, which can improve the cache hit rate.
// However, this is less efficient for short rows. 256 is pretty versatile.
// May be worth implementing heuristics later.
#define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel.
// This is trickier than the first_dim case because we must make blocks
// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses.
// Since output-element parallelism is along the fast dimension, this reduces the number of
// blocks we can launch by 16X.
#define TILE_W 16
// Somewhat versatile strategy: max out intra-block parallelism by extending
// blocks across the slow dimension up to the hardware-max block size of 1024.
#define TILE_H 64
// Lock in a local version of ReduceAdd, copied from THCTensorMathReduce.cuh:
template <typename T>
struct ReduceAdd {
inline __device__ T operator()(const T a, const T b) const {
return THCNumerics<T>::add(a, b);
}
};
// lanes is intended to be <= 32.
template
<typename T,
typename ReduceOp>
__device__ __forceinline__ void reduce_block_into_lanes
(T *x,
T val,
int lanes,
ReduceOp reduceOp)
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y;
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] = reduceOp(x[tid], x[tid+i]);
__syncthreads();
}
if(tid < 32)
{
T final;
if(blockSize >= 64)
final = reduceOp(x[tid], x[tid+32]);
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final = reduceOp(final, __SHFL_DOWN(final, i));
if(tid < lanes)
x[tid] = final; // EpilogueOp
}
// Make sure the smem result is visible to all warps.
__syncthreads();
}
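Stripped of the CUDA mechanics, reduce_block_into_lanes leaves x[0..lanes-1] holding per-lane partial sums of every thread's val: lane j accumulates the vals of all threads whose tid is congruent to j modulo lanes. A rough NumPy sketch of that contract, assuming lanes is a power of two <= 32 and the block size is a multiple of lanes:

import numpy as np

def reduce_block_into_lanes_reference(vals, lanes):
    # vals: one value per thread, ordered by tid = threadIdx.x + threadIdx.y*blockDim.x.
    vals = np.asarray(vals, dtype=np.float32)
    # Column j collects vals[j], vals[j+lanes], vals[j+2*lanes], ...
    return vals.reshape(-1, lanes).sum(axis=0)

# lanes=1 reproduces the full block sum used by the first-dim kernels;
# lanes=blockDim.x gives one sum per fast-dim column, as in the last-dim kernels.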
#include <ATen/ATen.h>
// #include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDATensorMethods.cuh"
// #include "ATen/cuda/CUDATypeConversion.cuh"
// #include <THC/THCTensorMathReduce.cuh>
#include <THC/THCGeneral.h>
#include <assert.h>
#define BLOCK_SIZE 1024
#define MAX_BLOCKS 1024
// It makes sense to lock the type to "float" here because the downscaling
// should only be applied to the FP32 master gradients. Also, if "in" were
// a different type, it would require divergent code for the vectorized load logic.
// TODO:
// Update overflow check to use reduction from kernel_utils.cuh with
// ReduceOp from THCTensorMathReduce.cuh.
__global__ void scale_reduce_overflow
(float *in,
size_t n,
float scale,
uint8_t *overflow_out)
{
__shared__ uint8_t cta_overflow[BLOCK_SIZE];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = gridDim.x * blockDim.x;
uint8_t my_overflow = 0;
for (int i = tid * 4; i < n; i+= stride * 4) {
if (i < (n - 3)) {
float4 f4 = ((float4*)in)[i / 4];
if (isfinite(f4.x)) {
f4.x *= scale;
} else {
my_overflow = 1;
}
if (isfinite(f4.y)) {
f4.y *= scale;
} else {
my_overflow = 1;
}
if (isfinite(f4.z)) {
f4.z *= scale;
} else {
my_overflow = 1;
}
if (isfinite(f4.w)) {
f4.w *= scale;
} else {
my_overflow = 1;
}
((float4*)in)[i / 4] = f4;
} else {
for (; i < n; ++i) {
if (isfinite(in[i])) {
in[i] *= scale;
} else {
my_overflow = 1;
}
}
}
}
int tIdx = threadIdx.x;
cta_overflow[tIdx] = my_overflow;
__syncthreads();
int participating = BLOCK_SIZE / 2;
while (participating > 0) {
if (tIdx < participating) {
cta_overflow[tIdx] = max(cta_overflow[tIdx],
cta_overflow[tIdx + participating]);
}
participating /= 2;
__syncthreads();
}
if (tIdx == 0) {
overflow_out[blockIdx.x] = max(cta_overflow[0],
overflow_out[blockIdx.x]);
}
}
void scale_check_overflow_cuda
(const at::Tensor& d_grads,
float scale,
const at::Tensor& d_buf)
{
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
size_t n = d_grads.numel();
size_t buf_n = d_buf.numel();
int num_blks = min((int(n) + BLOCK_SIZE - 1) / BLOCK_SIZE,
MAX_BLOCKS);
assert(buf_n >= num_blks);
scale_reduce_overflow<<<num_blks, BLOCK_SIZE, 0, stream>>>
(d_grads.data<float>(),
n,
scale,
d_buf.data<uint8_t>());
THCudaCheck(cudaGetLastError());
}
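Functionally, the launch above scales every finite element of d_grads in place and records, per block, whether any non-finite element was seen. A rough PyTorch sketch of that contract (not the actual code path; finiteness is checked manually so the sketch also works on the 0.4-era API):

import torch

def scale_check_overflow_reference(grads, scale, overflow_buf):
    # grads: fp32 tensor, scaled in place; overflow_buf: uint8 buffer flagged on overflow.
    finite = (grads == grads) & (grads.abs() != float('inf'))
    grads[finite] *= scale
    if not bool(finite.all()):
        # The kernel sets one flag per block; this is the coarse equivalent.
        overflow_buf.fill_(1)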
#include "kernel_utils.cuh"
#include <ATen/ATen.h>
#ifdef VERSION_LE_04
#include "ATen/cuda/AccumulateType.cuh"
#else
#include "ATen/AccumulateType.h"
#endif
#include "ATen/cuda/CUDATensorMethods.cuh"
// #include "ATen/cuda/CUDATypeConversion.cuh"
// #include <THC/THCTensorMathReduce.cuh>
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_bwd_first_dim_kernel
(scalar_t* __restrict__ pLpv,
scalar_t* __restrict__ pLpg,
const scalar_t* __restrict__ pLpw,
const scalar_t* __restrict__ savedv,
const scalar_t* __restrict__ savedg,
const accscalar_t* __restrict__ savedNorms,
const int rowSize)
{
// For now, assign one block to each row.
const int tid = threadIdx.x;
const int row = blockIdx.x;
const int stride = blockDim.x;
// Logical index offset for this flattened row
const int rowStart = row*rowSize;
// Hack to get around nvcc complaining when an smem array is declared with the same name
// but different types in different kernels (in this case different instantiations)
// extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s"
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t pLpwi = scalar_cast<accscalar_t>(pLpw[i+rowStart]);
accscalar_t savedvi = scalar_cast<accscalar_t>(savedv[i+rowStart]);
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
accscalar_t result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
// more handy to keep around.
// Broadcast load; could use shared memory instead.
accscalar_t rnorm = 1.f/savedNorms[row];
accscalar_t rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(tid == 0)
pLpg[row] = scalar_cast<scalar_t>(result*rnorm);
// Broadcast load, could use shared memory instead.
accscalar_t g_this_row = scalar_cast<accscalar_t>(savedg[row]);
// Write v gradients. We are reusing values that were loaded earlier, so there
// is an optimization opportunity here (store values persistently).
for(int j = tid; j < rowSize; j += stride )
{
accscalar_t pLpwj = scalar_cast<accscalar_t>(pLpw[j+rowStart]);
accscalar_t savedvj = scalar_cast<accscalar_t>(savedv[j+rowStart]);
accscalar_t pLpvj = g_this_row*(rnorm*pLpwj - rnorm3*savedvj*result);
pLpv[j+rowStart] = scalar_cast<scalar_t>(pLpvj);
}
}
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_bwd_last_dim_kernel
(scalar_t* __restrict__ pLpv,
scalar_t* __restrict__ pLpg,
const scalar_t* __restrict__ pLpw,
const scalar_t* __restrict__ savedv,
const scalar_t* __restrict__ savedg,
const accscalar_t* __restrict__ savedNorms,
const int fast_dim_size,
const int slower_dims_size)
{
const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
int slower_dims_location = threadIdx.y;
int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t pLpwi = scalar_cast<accscalar_t>(pLpw[currentIdx]);
accscalar_t savedvi = scalar_cast<accscalar_t>(savedv[currentIdx]);
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
accscalar_t result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
accscalar_t rnorm = 1.f/savedNorms[fast_dim_location];
accscalar_t rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(threadIdx.y == 0)
pLpg[fast_dim_location] = scalar_cast<scalar_t>(result*rnorm);
// Entire block pulls these values, could use shared memory instead.
accscalar_t g_this_col = scalar_cast<accscalar_t>(savedg[fast_dim_location]);
// Write v gradients.
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t pLpwj = scalar_cast<accscalar_t>(pLpw[currentIdx]);
accscalar_t savedvj = scalar_cast<accscalar_t>(savedv[currentIdx]);
accscalar_t pLpvj = g_this_col*(rnorm*pLpwj - rnorm3*savedvj*result);
pLpv[currentIdx] = scalar_cast<scalar_t>(pLpvj);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
void weight_norm_bwd_cuda
(const at::Tensor& pLpv,
const at::Tensor& pLpg,
const at::Tensor& pLpw,
const at::Tensor& savedv,
const at::Tensor& savedg,
const at::Tensor& savedNorms,
int dim)
{
#ifdef DEBUG_ANY
using namespace std;
cout << "Hello from send_to_bwd with pLpw.type() = " << pLpw.type() << endl;
#endif
const int ndims = savedv.ndimension();
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
int rowSize = 1;
for(int i = ndims - 1; i > 0; i--)
rowSize *= savedv.size(i);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(savedv.type(),
"weight_norm_bwd_first_dim_kernel",
[&]
{
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
weight_norm_bwd_first_dim_kernel
<<<pLpw.size(0),
BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
(pLpv.data<cuda_scalar_t>(),
pLpg.data<cuda_scalar_t>(),
pLpw.data<cuda_scalar_t>(),
savedv.data<cuda_scalar_t>(),
savedg.data<cuda_scalar_t>(),
savedNorms.data<accscalar_t>(),
rowSize);
});
}
else if(dim == ndims - 1)
{
// Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array.
int slower_dims_size = 1;
for(int i = 0; i < ndims - 1; i++)
slower_dims_size *= savedv.size(i);
int fast_dim_size = savedv.size(ndims-1);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(savedv.type(),
"weight_norm_bwd_last_dim_kernel",
[&]
{
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
weight_norm_bwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t),
stream>>>
(pLpv.data<cuda_scalar_t>(),
pLpg.data<cuda_scalar_t>(),
pLpw.data<cuda_scalar_t>(),
savedv.data<cuda_scalar_t>(),
savedg.data<cuda_scalar_t>(),
savedNorms.data<accscalar_t>(),
fast_dim_size,
slower_dims_size);
});
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_bwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
// The kernel execution is asynchronous, so this will only catch errors on the kernel launch,
// not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught
// until a later error check on a synchronizing CUDA call. Unfortunately, without manually
// synchronizing here, this is the best we can do.
THCudaCheck(cudaGetLastError());
#ifdef DEBUG_PROFILE
THCudaCheck(cudaDeviceSynchronize());
#endif
}
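For checking, the gradient math implemented by these kernels for the dim == 0 case can be written densely in PyTorch as follows. A reference sketch in fp32; the kernels additionally cast to and from the input precision:

import torch

def weight_norm_bwd_first_dim_reference(pLpw, savedv, savedg, savedNorms):
    rows = pLpw.size(0)
    pLpw2d = pLpw.contiguous().view(rows, -1).float()
    v2d    = savedv.contiguous().view(rows, -1).float()
    norms  = savedNorms.view(rows, 1).float()
    g      = savedg.view(rows, 1).float()
    dot  = (pLpw2d * v2d).sum(dim=1, keepdim=True)          # per-row sum(pLpw * v)
    pLpg = (dot / norms).view_as(savedg)                     # matches result*rnorm
    pLpv = g * (pLpw2d / norms - v2d * dot / norms.pow(3))   # matches g*(rnorm*pLpw - rnorm3*v*result)
    return pLpv.view_as(savedv), pLpg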
#include "kernel_utils.cuh"
#include <ATen/ATen.h>
#ifdef VERSION_LE_04
#include "ATen/cuda/AccumulateType.cuh"
#else
#include "ATen/AccumulateType.h"
#endif
#include "ATen/cuda/CUDATensorMethods.cuh"
// #include "ATen/cuda/CUDATypeConversion.cuh"
// #include <THC/THCTensorMathReduce.cuh>
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_fwd_first_dim_kernel
(scalar_t* __restrict__ w,
accscalar_t* __restrict__ norms,
const scalar_t* __restrict__ v,
const scalar_t* __restrict__ g,
const int rowSize)
{
// We are norming each slowest-dim row of the tensor separately.
// For now, assign one block to each row.
const int tid = threadIdx.x;
const int row = blockIdx.x;
const int stride = blockDim.x;
// Logical index offset for this flattened row
const int rowStart = row*rowSize;
// Hack to get around nvcc complaining when an smem array is declared with the same name
// but different types in different kernels (in this case different instantiations)
// extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s"
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
accscalar_t thread_sum = 0.f;
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
accscalar_t result = s[0];
result = sqrtf(result);
if(tid == 0)
norms[row] = result;
// Broadcast load, could use shared memory instead.
accscalar_t g_this_row = scalar_cast<accscalar_t>(g[row]);
accscalar_t rnorm = 1.f/result; // for consistency with backward kernel
// Write data to output
for(int i = tid; i < rowSize; i += stride )
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
w[i+rowStart] = scalar_cast<scalar_t>(g_this_row*val_f*rnorm);
}
}
template
<typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_fwd_last_dim_kernel
(
scalar_t* __restrict__ w,
accscalar_t* __restrict__ norms,
const scalar_t* __restrict__ v,
const scalar_t* __restrict__ g,
const int fast_dim_size,
const int slower_dims_size
)
{
const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ char buf[];
accscalar_t* alloc = (accscalar_t*)buf;
accscalar_t* s = &alloc[0];
accscalar_t* rnorms_this_block = &alloc[blockDim.x*blockDim.y];
accscalar_t thread_sum = 0.f;
int slower_dims_location = threadIdx.y;
int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0)
{
accscalar_t result = s[threadIdx.x];
accscalar_t norm_this_col = sqrtf(result);
norms[fast_dim_location] = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
}
__syncthreads();
accscalar_t g_this_col = scalar_cast<accscalar_t>(g[fast_dim_location]);
accscalar_t rnorm = rnorms_this_block[threadIdx.x];
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
w[currentIdx] = scalar_cast<scalar_t>(g_this_col*val_f*rnorm);
currentIdx += blockDim.y*fast_dim_size;
slower_dims_location += blockDim.y;
}
}
void weight_norm_fwd_cuda
(const at::Tensor& w,
const at::Tensor& norms,
const at::Tensor& v,
const at::Tensor& g,
int dim)
{
#ifdef DEBUG_ANY
using namespace std;
cout << "hello from send_to_fwd with v.type() = " << v.type() << endl;
#endif
const int ndims = v.ndimension();
if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
int rowSize = 1;
for(int i = ndims - 1; i > 0; i--)
rowSize *= v.size(i);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(v.type(),
"weight_norm_fwd_first_dim_kernel",
[&]
{
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
weight_norm_fwd_first_dim_kernel
<<<v.size(0),
BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
(w.data<cuda_scalar_t>(),
norms.data<accscalar_t>(),
v.data<cuda_scalar_t>(),
g.data<cuda_scalar_t>(),
rowSize);
});
}
else if(dim == ndims - 1)
{
// Precompute slower_dims_size and fast_dim_size
int slower_dims_size = 1;
for(int i = 0; i < ndims - 1; i++)
slower_dims_size *= v.size(i);
int fast_dim_size = v.size(ndims-1);
using namespace at;
cudaStream_t stream = globalContext().getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(v.type(),
"weight_norm_fwd_last_dim_kernel",
[&]
{
using cuda_scalar_t = apex::cuda::type<scalar_t>;
USING_ACCSCALAR_T
// just trying this formatting out to see how it feels...
weight_norm_fwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t),
stream>>>
(w.data<cuda_scalar_t>(),
norms.data<accscalar_t>(),
v.data<cuda_scalar_t>(),
g.data<cuda_scalar_t>(),
fast_dim_size,
slower_dims_size);
});
}
// else
// {
// intermediate dim kernel. Error checking on the dim was already done in
// Module.cpp:weight_norm_fwd. Could put that logic here instead, if we include
// <python.h> in both files.
// }
// The kernel execution is asynchronous, so this will only catch errors on the kernel launch,
// not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught
// until a later error check on a synchronizing CUDA call. Unfortunately, without manually
// synchronizing here, this is the best we can do.
THCudaCheck(cudaGetLastError());
#ifdef DEBUG_PROFILE
THCudaCheck(cudaDeviceSynchronize());
#endif
}
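And the corresponding forward math for dim == 0, as a dense PyTorch reference sketch (fp32 accumulation, as in the kernel):

import torch

def weight_norm_fwd_first_dim_reference(v, g):
    rows  = v.size(0)
    v2d   = v.contiguous().view(rows, -1).float()
    norms = v2d.pow(2).sum(dim=1, keepdim=True).sqrt()       # L2 norm of each flattened row
    w     = (g.view(rows, 1).float() * v2d / norms).view_as(v)
    return w, norms.view((rows,) + (1,) * (v.dim() - 1))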
import torch.cuda
import ctypes.util
import os
import re
import subprocess
import torch
from setuptools import setup, find_packages
from distutils.command.clean import clean
from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME
# TODO: multiple modules, so we don't have to route all interfaces through
# the same interface.cpp file?
if not torch.cuda.is_available():
print("Warning: Torch did not find available GPUs on this system.\n",
@@ -22,111 +13,6 @@ if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
raise RuntimeError("APEx requires Pytorch 0.4 or newer.\n" +
"The latest stable release can be obtained from https://pytorch.org/")
version_le_04 = []
if TORCH_MAJOR == 0 and TORCH_MINOR == 4:
version_le_04 = ['-DVERSION_LE_04']
def find(path, regex_func, collect=False):
collection = [] if collect else None
for root, dirs, files in os.walk(path):
for file in files:
if regex_func(file):
if collect:
collection.append(os.path.join(root, file))
else:
return os.path.join(root, file)
return list(set(collection)) if collect else None
# Due to https://github.com/pytorch/pytorch/issues/8223, for Pytorch <= 0.4
# torch.utils.cpp_extension's check for CUDA_HOME fails if there are no GPUs
# available on the system, which prevents cross-compiling and building via Dockerfiles.
# Workaround: manually search for CUDA_HOME if Pytorch <= 0.4.
def find_cuda_home():
cuda_path = None
CUDA_HOME = None
CUDA_HOME = os.getenv('CUDA_HOME', '/usr/local/cuda')
if not os.path.exists(CUDA_HOME):
# We use nvcc path on Linux and cudart path on macOS
cudart_path = ctypes.util.find_library('cudart')
if cudart_path is not None:
cuda_path = os.path.dirname(cudart_path)
if cuda_path is not None:
CUDA_HOME = os.path.dirname(cuda_path)
if not cuda_path and not CUDA_HOME:
nvcc_path = find('/usr/local/', re.compile("nvcc").search, False)
if nvcc_path:
CUDA_HOME = os.path.dirname(nvcc_path)
if CUDA_HOME:
CUDA_HOME = os.path.dirname(CUDA_HOME)
if (not os.path.exists(CUDA_HOME+os.sep+"lib64")
or not os.path.exists(CUDA_HOME+os.sep+"include") ):
raise RuntimeError("Error: found NVCC at ",
nvcc_path,
" but could not locate CUDA libraries"+
" or include directories.")
raise RuntimeError("Error: Could not find cuda on this system. " +
"Please set your CUDA_HOME environment variable "
"to the CUDA base directory.")
return CUDA_HOME
if TORCH_MAJOR == 0 and TORCH_MINOR == 4:
if CUDA_HOME is None:
CUDA_HOME = find_cuda_home()
# Patch cpp_extension's view of CUDA_HOME:
torch.utils.cpp_extension.CUDA_HOME = CUDA_HOME
def get_cuda_version():
NVCC = find(CUDA_HOME+os.sep+"bin",
re.compile('nvcc$|nvcc.exe').search)
print("Found NVCC = ", NVCC)
# Parse output of nvcc to get cuda major version
nvcc_output = subprocess.check_output([NVCC, '--version']).decode("utf-8")
CUDA_LIB = re.compile(', V[0-9]+\.[0-9]+\.[0-9]+').search(nvcc_output).group(0).split('V')[1]
print("Found CUDA_LIB = ", CUDA_LIB)
CUDA_MAJOR = int(CUDA_LIB.split('.')[0])
print("Found CUDA_MAJOR = ", CUDA_MAJOR)
if CUDA_MAJOR < 8:
raise RuntimeError("APex requires CUDA 8.0 or newer")
return CUDA_MAJOR
if CUDA_HOME is not None:
print("Found CUDA_HOME = ", CUDA_HOME)
CUDA_MAJOR = get_cuda_version()
gencodes = ['-gencode', 'arch=compute_50,code=sm_50',
'-gencode', 'arch=compute_52,code=sm_52',
'-gencode', 'arch=compute_60,code=sm_60',
'-gencode', 'arch=compute_61,code=sm_61',]
if CUDA_MAJOR > 8:
gencodes += ['-gencode', 'arch=compute_70,code=sm_70',
'-gencode', 'arch=compute_70,code=compute_70',]
ext_modules = []
extension = CUDAExtension(
'apex_C', [
'csrc/interface.cpp',
'csrc/weight_norm_fwd_cuda.cu',
'csrc/weight_norm_bwd_cuda.cu',
'csrc/scale_cuda.cu',
],
extra_compile_args={'cxx': ['-g'] + version_le_04,
'nvcc': ['-O3'] + version_le_04 + gencodes})
ext_modules.append(extension)
else:
raise RuntimeError("Could not find Cuda install directory")
setup(
name='apex',
version='0.1',
@@ -139,7 +25,5 @@ setup(
'tests',
'examples',
'apex.egg-info',)),
ext_modules=ext_modules,
description='PyTorch Extensions written by NVIDIA',
cmdclass={'build_ext': torch.utils.cpp_extension.BuildExtension},
)
import torch
import numpy as np
def compare(cuda_out, pt_out, pt_out_control, rows):
# print( "Pytorch ops in fp16: ", pt_out )
# print( "Kernel result: ", cuda_out )
# print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)
# Make upconverted copies for error check against fp32 control
cuda_out_fp32 = cuda_out.float()
pt_out_fp32 = pt_out.float()
# Flatten all but the slowest dimension
cuda_out = cuda_out.view(rows,-1)
pt_out = pt_out.view(rows,-1)
cuda_out_fp32 = cuda_out_fp32.view(rows,-1)
pt_out_fp32 = pt_out_fp32.view(rows,-1)
pt_out_control = pt_out_control.view(rows,-1)
cuda_maxdiffs, cuda_maxdiff_locs = torch.max((pt_out_control - cuda_out_fp32).abs(),1)
pt_maxdiffs, pt_maxdiff_locs = torch.max((pt_out_control - pt_out_fp32 ).abs(),1)
print( "cuda_maxdiffs = ", cuda_maxdiffs )
# print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
print( "pt_maxdiffs = ", pt_maxdiffs )
# print( "pt_maxdiff_locs = ", pt_maxdiff_locs )
row_indices = torch.LongTensor(np.arange(rows))
# print("cuda_out at cuda_maxdiff_locs in each row:")
# # bizarrely, this will work if you do it at the python prompt:
# # print(cuda_out[row_indices,cuda_maxdiff_locs])
# # ...but it only seems to work here if you wrap with numpy arrays:
# print( cuda_out[np.array(row_indices),np.array(cuda_maxdiff_locs)])
# print("pt_out_control at cuda_maxdiff_locs in each row:")
# print(pt_out_control[np.array(row_indices),np.array(cuda_maxdiff_locs)])
#
# print("pt_out at pt_maxdiff_locs in each row:" )
# print( pt_out[np.array(row_indices),np.array(pt_maxdiff_locs)])
# print("pt_out_control at pt_maxdiff_locs in each row:" )
# print(pt_out_control[np.array(row_indices),np.array(pt_maxdiff_locs)])
import torch
def get_norm_shape(p, dim):
if dim == 0:
output_size = (p.size(0),) + (1,) * (p.dim() - 1)
return output_size
elif dim == p.dim() - 1:
output_size = (1,) * (p.dim() - 1) + (p.size(-1),)
return output_size
return None
def pt_norm(p, dim):
"""Computes the norm over all dimensions except dim"""
if dim is None:
return p.norm()
elif dim == 0:
return p.contiguous().view(p.size(0), -1).norm(2,dim=1).view(*get_norm_shape(p, dim))
elif dim == p.dim() - 1:
return p.contiguous().view(-1, p.size(-1)).norm(2,dim=0).view(*get_norm_shape(p, dim))
return pt_norm(p.transpose(0, dim), 0).transpose(0, dim)
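A quick shape-level illustration of the two helpers above (values arbitrary; assumes get_norm_shape and pt_norm are in scope, e.g. via "from norm import pt_norm, get_norm_shape" as in the tests):

import torch

p = torch.randn(4, 5, 6)
print(get_norm_shape(p, 0))             # (4, 1, 1)
print(get_norm_shape(p, p.dim() - 1))   # (1, 1, 6)
print(pt_norm(p, 0).size())             # torch.Size([4, 1, 1])
print(pt_norm(p, p.dim() - 1).size())   # torch.Size([1, 1, 6])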
import torch
from torch.autograd import Variable
from apex.fp16_utils import Fused_Weight_Norm
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
rows = 321 # 1
cols = 33 # 4096
fast = 185 # 4096
dims = rows, cols, fast
dim = 0
CUDA_HALF = True
RAND = True # If false, input gradients (the result of the backward pass)
# should be analytically zero.
# Loss will be computed via (output*elementwise).sum().
# This means that output gradients in the backward pass will be equal
# to elementwise, so by manipulating elementwise, we have easy
# fine-grained control over the output gradients we'd like to use for
# testing purposes.
#
# The alternative is just to create the output_gradients manually
# and call output.backward(gradient=output_gradients),
# as is done in test_backward.py.
# But I wanted a minimal working sample similar to an "actual" use case,
# where gradients are computed by calling backward() on a scalar Loss.
if RAND:
# With std=6.0, I observe the pytorch fp16 ops going unstable (sometimes)
# while the fused kernel remains stable.
pt_in_fp32 = torch.cuda.FloatTensor(*dims ).normal_(std=1.0)
norm_shape = get_norm_shape(pt_in_fp32, dim)
pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).normal_(std=1.0)
elementwise_fp32 = torch.cuda.FloatTensor(*dims ).normal_(std=1.0)
else:
pt_in_fp32 = torch.cuda.FloatTensor(*dims ).fill_(1.0)
norm_shape = get_norm_shape(pt_in_fp32, dim)
pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).fill_(2.0)
elementwise_fp32 = torch.cuda.FloatTensor(*dims ).fill_(0.5)
pt_in_fp16 = pt_in_fp32.half()
cd_in_prec = pt_in_fp32.clone()
pt_g_fp16 = pt_g_fp32.half()
cd_g_prec = pt_g_fp32.clone()
elementwise_fp16 = elementwise_fp32.half()
elementwise_prec = elementwise_fp32.clone()
if CUDA_HALF:
cd_in_prec = cd_in_prec.half()
cd_g_prec = cd_g_prec.half()
elementwise_prec = elementwise_prec.half()
pt_in_fp32 = Variable(pt_in_fp32 , requires_grad=True)
pt_in_fp16 = Variable(pt_in_fp16 , requires_grad=True)
cd_in_prec = Variable(cd_in_prec , requires_grad=True)
pt_g_fp32 = Variable(pt_g_fp32 , requires_grad=True)
pt_g_fp16 = Variable(pt_g_fp16 , requires_grad=True)
cd_g_prec = Variable(cd_g_prec , requires_grad=True)
elementwise_fp32 = Variable(elementwise_fp32, requires_grad=False)
elementwise_fp16 = Variable(elementwise_fp16, requires_grad=False)
elementwise_prec = Variable(elementwise_prec, requires_grad=False)
torch.cuda.nvtx.range_push("fp16 forward, {}".format(pt_in_fp16.size()))
pt_norms_fp16 = pt_norm(pt_in_fp16, dim)
pt_out_fp16 = pt_in_fp16*(pt_g_fp16/pt_norms_fp16)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
torch.cuda.nvtx.range_push("fp32 forward, {}".format(pt_in_fp32.size()))
pt_norms_fp32 = pt_norm(pt_in_fp32, dim)
pt_out_fp32 = pt_in_fp32*(pt_g_fp32/pt_norms_fp32)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
# print("pt_norms_fp16 = ", pt_norms_fp16 )
# print("pt_norms_fp32 = ", pt_norms_fp32)
# print( "cd_in_prec.data_ptr = {:x}".format(cd_in_prec.data_ptr()))
# print("elementwise_fp16 = ", elementwise_fp16)
cd_in_contig = cd_in_prec.contiguous()
# Deliberately make noncontig to see if fused_norm
# will handle the error
# cd_in_contig = cd_in_contig[:,0:5]
# print(type(cd_in_contig))
torch.cuda.nvtx.range_push("kernel forward")
fused_weight_norm = Fused_Weight_Norm.apply
cd_out_prec = fused_weight_norm(cd_in_contig, cd_g_prec, dim)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
# print("type(cd_out_prec.data) = ", type(cd_out_prec.data))
# print("cd_out_prec.data_ptr = {:x}".format(cd_out_prec.data_ptr()))
print("\n\n\nCOMPARING FORWARD PASS RESULTS\n\n\n")
compare(cd_out_prec.data,
pt_out_fp16.data,
pt_out_fp32.data,
rows)
# It's ok to use elementwise_fp16 as a leaf in both the cuda and pytorch graphs.
# This sharing should not affect the computed gradients wrt pt_in_fp16 and cd_in_prec.
# However, just remember:
# If we set requires_grad=True for elementwise_fp16, elementwise_fp16.grad.data
# will accumulate gradients during the backward passes for both the cd and pytorch Losses.
#
# I do need v these parentheses v
Loss_cd_prec = (cd_out_prec*elementwise_prec).sum()
# print(L_cd_fp16)
Loss_pt_fp16 = (pt_out_fp16*elementwise_fp16).sum()
# print(L_pt_fp16)
Loss_pt_fp32 = (pt_out_fp32*elementwise_fp32).sum()
# print(L_pt_fp32)
torch.cuda.nvtx.range_push("kernel backward")
Loss_cd_prec.backward()
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("fp16 backward")
Loss_pt_fp16.backward()
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("fp32 backward")
Loss_pt_fp32.backward()
torch.cuda.nvtx.range_pop()
print("\n\n\nCOMPARING v GRADIENT RESULTS\n\n\n")
compare(cd_in_prec.grad.data,
pt_in_fp16.grad.data,
pt_in_fp32.grad.data,
rows)
print("\n\n\nCOMPARING g GRADIENT RESULTS\n\n\n")
compare(cd_g_prec.grad.data,
pt_g_fp16.grad.data,
pt_g_fp32.grad.data,
cd_g_prec.size(0))
import torch
from torch.autograd import Variable
import apex_C
import numpy as np
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
sizes = [
# (3, 512, 1024),
# (3, 512, 1536),
(3, 768, 1536),
# (3, 768, 2048),
# (3, 1024, 2048),
# (1, 1024, 4096),
# (1, 2048, 8192),
# (1, 4096, 4096), # this is not one of the fairseq sizes, just a reference benchmark.
# (4096, 4096, 1), # this is not one of the fairseq sizes, just a reference benchmark.
]
# rows = 3
# cols = 512
# fast = 1024
HALF = True
RAND = True
dim = 2
for rows, cols, fast in sizes:
dims = rows, cols, fast
# Incoming gradient vectors we will use later
# Need to create the fp16 versions as a half() copy of a Tensor first rather than
# a Variable, because if you create pt_input_control as a Variable and then say
# pt_input_fp16 = pt_input_control.half(), you are accidentally making pt_input_fp16 part of
# pt_input_control's computational graph, instead of the leaf of its own separate graph.
# Careful: if you initialize with torch.ones, the gradient wrt input becomes analytically zero.
if RAND:
pLpOutput_control = torch.cuda.FloatTensor(*dims ).uniform_()*1.0
norm_shape = get_norm_shape(pLpOutput_control, dim)
pLpg_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
pt_input_control = torch.cuda.FloatTensor(*dims ).uniform_()
pt_g_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
else:
pLpOutput_control = torch.cuda.FloatTensor(*dims ).fill_(1.)
norm_shape = get_norm_shape(pLpOutput_control, dim)
pLpg_control = torch.cuda.FloatTensor(*norm_shape).fill_(2.)
pt_input_control = torch.cuda.FloatTensor(*dims ).fill_(4.0)
pt_g_control = torch.cuda.FloatTensor(*norm_shape).fill_(3.0)
pLpOutput_fp16 = pLpOutput_control.clone()
pLpg_fp16 = pLpg_control .clone()
pt_input_fp16 = pt_input_control .clone()
pt_g_fp16 = pt_g_control .clone()
if HALF:
pLpOutput_fp16 = pLpOutput_fp16.half()
pLpg_fp16 = pLpg_fp16 .half()
pt_input_fp16 = pt_input_fp16 .half()
pt_g_fp16 = pt_g_fp16 .half()
pLpOutput_control = Variable(pLpOutput_control)
pLpg_control = Variable(pLpg_control )
pLpOutput_fp16 = Variable(pLpOutput_fp16 )
pLpg_fp16 = Variable(pLpg_fp16 )
pt_input_control = Variable(pt_input_control, requires_grad=True)
pt_g_control = Variable(pt_g_control , requires_grad=True)
pt_input_fp16 = Variable(pt_input_fp16 , requires_grad=True)
pt_g_fp16 = Variable(pt_g_fp16 , requires_grad=True)
# Do forward pass in fp16 and fp32
pt_norms_fp16 = pt_norm(pt_input_fp16, dim)
pt_norms_control = pt_norm(pt_input_control, dim)
pt_output_fp16 = pt_input_fp16 *(pt_g_fp16 /pt_norms_fp16 )
pt_output_control = pt_input_control*(pt_g_control/pt_norms_control)
# Run the Cuda version
pLpInput_cuda = torch.cuda.FloatTensor(*dims ).fill_(0.)
pLpg_cuda = torch.cuda.FloatTensor(*norm_shape).fill_(0.)
if HALF:
pLpInput_cuda = pLpInput_cuda.half()
pLpg_cuda = pLpg_cuda .half()
torch.cuda.nvtx.range_push("kernel weight norm backward")
apex_C.weight_norm_bwd(pLpInput_cuda,
pLpg_cuda,
pLpOutput_fp16,
pt_input_fp16,
pt_g_fp16,
pt_norms_control.data,
dim)
torch.cuda.nvtx.range_pop()
print("grad_output: ", pLpOutput_fp16.data)
print(" grad_input: ", pLpInput_cuda)
print(" savedInput: ", pt_input_fp16.data)
print("pt_norms_control: ", pt_norms_control.data)
print("pt_norms_fp16: ", pt_norms_fp16.data)
torch.cuda.nvtx.range_push("pytorch fp16 backward")
pt_output_fp16 .backward(gradient=pLpOutput_fp16 , create_graph=True)
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("pytorch fp32 backward")
pt_output_control.backward(gradient=pLpOutput_control, create_graph=True)
torch.cuda.nvtx.range_pop()
# pt_output_fp16 and pt_output_control are still saved, but
# pt_output_fp16.grad and pt_output_control.grad are None at this point
# because the graph is freed in the backwards pass.
# Specifying create_graph/retain_graph doesn't seem to force saving of
# either the intermediate variables or their gradients.
print("Comparing gradients wrt v")
torch.cuda.nvtx.range_push("compare pLpv")
compare(pLpInput_cuda, pt_input_fp16.grad.data, pt_input_control.grad.data, rows)
torch.cuda.nvtx.range_pop()
print("Comparing gradients wrt g")
torch.cuda.nvtx.range_push("compare pLpg")
compare(pLpg_cuda, pt_g_fp16.grad.data, pt_g_control.grad.data, pLpg_cuda.size(0))
torch.cuda.nvtx.range_pop()
import torch
import sys
import apex_C
import numpy as np
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
sizes = [
# (3, 512, 1024),
# (3, 512, 1536),
# (3, 768, 1536),
# (3, 768, 2048),
# (3, 1024, 2048),
# (1, 1024, 4096),
# (1, 2048, 8192),
# (1, 4096, 4096), # this is not one of the fairseq sizes, just a reference benchmark.
(4096, 4096, 1), # this is not one of the fairseq sizes, just a reference benchmark.
# (353, 55, 353), # this is not one of the fairseq sizes, just a reference benchmark.
]
# rows = 3
# cols = 512
# fast = 1024
HALF = True
RAND = True
dim = 0
for rows, cols, fast in sizes:
dims = rows, cols, fast
print("\n\nTESTING dims = {}\n\n".format(dims))
if RAND:
pt_in = 1.*torch.cuda.FloatTensor(*dims).uniform_()
g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).uniform_()
else:
pt_in = torch.cuda.FloatTensor(*dims).fill_(1.)
g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(6.0)
# per_col = torch.arange(1,cols+1).cuda()
# print((rows*per_col*per_col).sqrt())
# pt_in *= per_col
cuda_out = torch.cuda.FloatTensor(*dims).fill_(0.)
cuda_norms = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(0.)
# Save a copy of the input as float
pt_in_fp32 = pt_in.clone()
g_fp32 = g.clone()
if HALF:
pt_in = pt_in.half()
g = g.half()
cuda_out = cuda_out.half()
apex_C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
torch.cuda.synchronize()
# quit()
print("type(cuda_out) = {}\n".format(type(cuda_out)))
rownorms = pt_norm(pt_in, dim)
rownorms_fp32 = pt_norm(pt_in_fp32, dim)
print("rownorms_fp32:")
print(rownorms_fp32)
print("cuda_norms" )
print(cuda_norms )
# rownorms is broadcast; torch.div(pt_in, rownorms) and pt_in/rownorms work the same way
pt_out = pt_in*(g/rownorms)
pt_out_control = pt_in_fp32*(g_fp32/rownorms_fp32)
compare(cuda_out, pt_out, pt_out_control, rows)