Commit 2fa4dbaf authored by Christian Sarofeen

Initial release
import torch
import numpy as np
def compare(cuda_out, pt_out, pt_out_control, rows):
print( "Pytorch ops in fp16: ", pt_out )
print( "Kernel result: ", cuda_out )
print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)
# Make upconverted copies for error check against fp32 control
cuda_out_fp32 = cuda_out.float()
pt_out_fp32 = pt_out.float()
# Flatten all but the slowest dimension
cuda_out = cuda_out.view(rows,-1)
pt_out = pt_out.view(rows,-1)
cuda_out_fp32 = cuda_out_fp32.view(rows,-1)
pt_out_fp32 = pt_out_fp32.view(rows,-1)
pt_out_control = pt_out_control.view(rows,-1)
cuda_maxdiffs, cuda_maxdiff_locs = torch.max((pt_out_control - cuda_out_fp32).abs(),1)
pt_maxdiffs, pt_maxdiff_locs = torch.max((pt_out_control - pt_out_fp32 ).abs(),1)
print( "cuda_maxdiffs = ", cuda_maxdiffs )
print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
print( "pt_maxdiffs = ", pt_maxdiffs )
print( "pt_maxdiff_locs = ", pt_maxdiff_locs )
row_indices = torch.LongTensor(np.arange(rows))
print("cuda_out at cuda_maxdiff_locs in each row:")
# bizarrely, this will work if you do it at the python prompt:
# print(cuda_out[row_indices,cuda_maxdiff_locs])
# ...but it only seems to work here if you wrap with numpy arrays:
print( cuda_out[np.array(row_indices),np.array(cuda_maxdiff_locs)])
print("pt_out_control at cuda_maxdiff_locs in each row:")
print(pt_out_control[np.array(row_indices),np.array(cuda_maxdiff_locs)])
print("pt_out at pt_maxdiff_locs in each row:" )
print( pt_out[np.array(row_indices),np.array(pt_maxdiff_locs)])
print("pt_out_control at pt_maxdiff_locs in each row:" )
print(pt_out_control[np.array(row_indices),np.array(pt_maxdiff_locs)])
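
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original file), assuming a CUDA
# device is available. The shapes and names below are illustrative only: a
# fused-kernel fp16 result and a plain-PyTorch fp16 result are compared
# against an fp32 control for a 2-row problem.
if __name__ == "__main__":
    control_fp32 = torch.cuda.FloatTensor(2, 4).normal_()
    kernel_result = control_fp32.half()   # stand-in for a fused-kernel output
    pytorch_fp16 = control_fp32.half()    # stand-in for plain fp16 PyTorch ops
    compare(kernel_result, pytorch_fp16, control_fp32, rows=2)
# ---------------------------------------------------------------------------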
import torch
def get_norm_shape(p, dim):
    if dim == 0:
        output_size = (p.size(0),) + (1,) * (p.dim() - 1)
        return output_size
    elif dim == p.dim() - 1:
        output_size = (1,) * (p.dim() - 1) + (p.size(-1),)
        return output_size
    return None


def pt_norm(p, dim):
    """Computes the norm over all dimensions except dim"""
    if dim is None:
        return p.norm()
    elif dim == 0:
        return p.contiguous().view(p.size(0), -1).norm(2, dim=1).view(*get_norm_shape(p, dim))
    elif dim == p.dim() - 1:
        return p.contiguous().view(-1, p.size(-1)).norm(2, dim=0).view(*get_norm_shape(p, dim))
    return pt_norm(p.transpose(0, dim), 0).transpose(0, dim)
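
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original file). For weight norm,
# w = g * v / ||v||, where the norm is taken over every dimension except
# `dim`; pt_norm returns one norm per slice along `dim`, reshaped by
# get_norm_shape so it broadcasts against v. The shape (3, 5, 7) below is an
# illustrative assumption.
if __name__ == "__main__":
    v = torch.randn(3, 5, 7)
    for d in (0, v.dim() - 1):
        norms = pt_norm(v, d)
        print("dim =", d, "norm shape =", norms.size(), "expected =", get_norm_shape(v, d))
# ---------------------------------------------------------------------------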
import torch
from torch.autograd import Variable
from apex.fp16_utils import Fused_Weight_Norm
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
rows = 1 # 321
cols = 4096 # 33
fast = 4096 # 185
dims = rows, cols, fast
dim = 2
CUDA_HALF = False
RAND = True # If false, input gradients (the result of the backward pass)
# should be analytically zero.
# Loss will be computed via (output*elementwise).sum().
# This means that output gradients in the backward pass will be equal
# to elementwise, so by manipulating elementwise, we have easy
# fine-grained control over the output gradients we'd like to use for
# testing purposes.
#
# The alternative is just to create the output_gradients manually
# and call output.backward(gradient=output_gradients),
# as is done in test_backward.py.
# But I wanted a minimal working sample similar to an "actual" use case,
# where gradients are computed by calling backward() on a scalar Loss.
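
# ---------------------------------------------------------------------------
# Side note (not part of the original script): a minimal sketch checking the
# claim above, i.e. that for Loss = (output*elementwise).sum() the gradient
# arriving at `output` equals `elementwise`. The `_demo` names are
# illustrative only.
out_demo = Variable(torch.cuda.FloatTensor(2, 3).normal_(), requires_grad=True)
elem_demo = Variable(torch.cuda.FloatTensor(2, 3).normal_())
(out_demo * elem_demo).sum().backward()
# d/d(out_demo) of sum(out_demo * elem_demo) is elem_demo itself:
print("max |output grad - elementwise| =",
      (out_demo.grad.data - elem_demo.data).abs().max())
# ---------------------------------------------------------------------------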
if RAND:
    # With std=6.0, I observe the pytorch fp16 ops going unstable
    # while the fused kernel remains stable (sometimes).
    pt_in_fp32 = torch.cuda.FloatTensor(*dims).normal_(std=1.0)
    norm_shape = get_norm_shape(pt_in_fp32, dim)
    pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).normal_(std=1.0)
    elementwise_fp32 = torch.cuda.FloatTensor(*dims).normal_(std=1.0)
else:
    pt_in_fp32 = torch.cuda.FloatTensor(*dims).fill_(1.0)
    norm_shape = get_norm_shape(pt_in_fp32, dim)
    pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).fill_(2.0)
    elementwise_fp32 = torch.cuda.FloatTensor(*dims).fill_(0.5)
pt_in_fp16 = pt_in_fp32.half()
cd_in_prec = pt_in_fp32.clone()
pt_g_fp16 = pt_g_fp32.half()
cd_g_prec = pt_g_fp32.clone()
elementwise_fp16 = elementwise_fp32.half()
elementwise_prec = elementwise_fp32.clone()
if CUDA_HALF:
    cd_in_prec = cd_in_prec.half()
    cd_g_prec = cd_g_prec.half()
    elementwise_prec = elementwise_prec.half()
pt_in_fp32 = Variable(pt_in_fp32 , requires_grad=True)
pt_in_fp16 = Variable(pt_in_fp16 , requires_grad=True)
cd_in_prec = Variable(cd_in_prec , requires_grad=True)
pt_g_fp32 = Variable(pt_g_fp32 , requires_grad=True)
pt_g_fp16 = Variable(pt_g_fp16 , requires_grad=True)
cd_g_prec = Variable(cd_g_prec , requires_grad=True)
elementwise_fp32 = Variable(elementwise_fp32, requires_grad=False)
elementwise_fp16 = Variable(elementwise_fp16, requires_grad=False)
elementwise_prec = Variable(elementwise_prec, requires_grad=False)
torch.cuda.nvtx.range_push("fp16 forward, {}".format(pt_in_fp16.size()))
pt_norms_fp16 = pt_norm(pt_in_fp16, dim)
pt_out_fp16 = pt_in_fp16*(pt_g_fp16/pt_norms_fp16)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
torch.cuda.nvtx.range_push("fp32 forward, {}".format(pt_in_fp32.size()))
pt_norms_fp32 = pt_norm(pt_in_fp32, dim)
pt_out_fp32 = pt_in_fp32*(pt_g_fp32/pt_norms_fp32)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
# print("pt_norms_fp16 = ", pt_norms_fp16 )
# print("pt_norms_fp32 = ", pt_norms_fp32)
# print( "cd_in_prec.data_ptr = {:x}".format(cd_in_prec.data_ptr()))
# print("elementwise_fp16 = ", elementwise_fp16)
cd_in_contig = cd_in_prec.contiguous()
# Deliberately make noncontig to see if fused_norm
# will handle the error
# cd_in_contig = cd_in_contig[:,0:5]
# print(type(cd_in_contig))
torch.cuda.nvtx.range_push("kernel forward")
fused_weight_norm = Fused_Weight_Norm.apply
cd_out_prec = fused_weight_norm(cd_in_contig, cd_g_prec, dim)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
# print("type(cd_out_prec.data) = ", type(cd_out_prec.data))
# print("cd_out_prec.data_ptr = {:x}".format(cd_out_prec.data_ptr()))
print("\n\n\nCOMPARING FORWARD PASS RESULTS\n\n\n")
compare(cd_out_prec.data,
        pt_out_fp16.data,
        pt_out_fp32.data,
        rows)
# It's ok to use elementwise_fp16 as a leaf in both the cuda and pytorch graphs.
# This sharing should not affect the computed gradients wrt pt_in_fp16 and cd_in_prec.
# However, just remember:
# If we set requires_grad=True for elementwise_fp16, elementwise_fp16.grad.data
# will accumulate gradients during the backward passes for both the cd and pytorch Losses.
#
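
# ---------------------------------------------------------------------------
# Side note (not part of the original script): a minimal sketch of the
# accumulation behaviour described above. When a leaf with requires_grad=True
# feeds two separate losses, its .grad accumulates across both backward
# passes. The `_demo` names are illustrative only.
shared_demo = Variable(torch.cuda.FloatTensor(4).fill_(1.0), requires_grad=True)
(shared_demo * 2.0).sum().backward()
(shared_demo * 3.0).sum().backward()
# After the two backward passes, every element of .grad holds 2 + 3 = 5.
print("accumulated grad on shared leaf:", shared_demo.grad.data)
# ---------------------------------------------------------------------------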
# The parentheses below are required: without them, .sum() would bind only to
# the elementwise tensor instead of to the product.
Loss_cd_prec = (cd_out_prec*elementwise_prec).sum()
# print(Loss_cd_prec)
Loss_pt_fp16 = (pt_out_fp16*elementwise_fp16).sum()
# print(Loss_pt_fp16)
Loss_pt_fp32 = (pt_out_fp32*elementwise_fp32).sum()
# print(Loss_pt_fp32)
torch.cuda.nvtx.range_push("kernel backward")
Loss_cd_prec.backward()
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("fp16 backward")
Loss_pt_fp16.backward()
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("fp32 backward")
Loss_pt_fp32.backward()
torch.cuda.nvtx.range_pop()
print("\n\n\nCOMPARING v GRADIENT RESULTS\n\n\n")
compare(cd_in_prec.grad.data,
        pt_in_fp16.grad.data,
        pt_in_fp32.grad.data,
        rows)
print("\n\n\nCOMPARING g GRADIENT RESULTS\n\n\n")
compare(cd_g_prec.grad.data,
        pt_g_fp16.grad.data,
        pt_g_fp32.grad.data,
        cd_g_prec.size(0))
import torch
from torch.autograd import Variable
import apex._C
import numpy as np
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
sizes = [
    # (3, 512, 1024),
    # (3, 512, 1536),
    (3, 768, 1536),
    # (3, 768, 2048),
    # (3, 1024, 2048),
    # (1, 1024, 4096),
    # (1, 2048, 8192),
    # (1, 4096, 4096), # this is not one of natalia's sizes, just a reference benchmark.
    # (4096, 4096, 1), # this is not one of natalia's sizes, just a reference benchmark.
]
# rows = 3
# cols = 512
# fast = 1024
HALF = True
RAND = True
dim = 2
for rows, cols, fast in sizes:
    dims = rows, cols, fast
    # Incoming gradient vectors we will use later.
    # The fp16 copies need to be created from plain Tensors with .half() before wrapping in
    # Variables: if pt_input_control were already a Variable and we wrote
    # pt_input_fp16 = pt_input_control.half(), pt_input_fp16 would accidentally become part of
    # pt_input_control's computational graph instead of being the leaf of its own separate graph.
    # Careful: if you initialize with torch.ones, the gradient wrt input becomes analytically zero :P
    if RAND:
        pLpOutput_control = torch.cuda.FloatTensor(*dims).uniform_()*1.0
        norm_shape = get_norm_shape(pLpOutput_control, dim)
        pLpg_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
        pt_input_control = torch.cuda.FloatTensor(*dims).uniform_()
        pt_g_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
    else:
        pLpOutput_control = torch.cuda.FloatTensor(*dims).fill_(1.)
        norm_shape = get_norm_shape(pLpOutput_control, dim)
        pLpg_control = torch.cuda.FloatTensor(*norm_shape).fill_(2.)
        pt_input_control = torch.cuda.FloatTensor(*dims).fill_(4.0)
        pt_g_control = torch.cuda.FloatTensor(*norm_shape).fill_(3.0)

    pLpOutput_fp16 = pLpOutput_control.clone()
    pLpg_fp16 = pLpg_control.clone()
    pt_input_fp16 = pt_input_control.clone()
    pt_g_fp16 = pt_g_control.clone()

    if HALF:
        pLpOutput_fp16 = pLpOutput_fp16.half()
        pLpg_fp16 = pLpg_fp16.half()
        pt_input_fp16 = pt_input_fp16.half()
        pt_g_fp16 = pt_g_fp16.half()

    pLpOutput_control = Variable(pLpOutput_control)
    pLpg_control = Variable(pLpg_control)
    pLpOutput_fp16 = Variable(pLpOutput_fp16)
    pLpg_fp16 = Variable(pLpg_fp16)
    pt_input_control = Variable(pt_input_control, requires_grad=True)
    pt_g_control = Variable(pt_g_control, requires_grad=True)
    pt_input_fp16 = Variable(pt_input_fp16, requires_grad=True)
    pt_g_fp16 = Variable(pt_g_fp16, requires_grad=True)

    # Do forward pass in fp16 and fp32
    pt_norms_fp16 = pt_norm(pt_input_fp16, dim)
    pt_norms_control = pt_norm(pt_input_control, dim)
    pt_output_fp16 = pt_input_fp16*(pt_g_fp16/pt_norms_fp16)
    pt_output_control = pt_input_control*(pt_g_control/pt_norms_control)

    # Run the Cuda version
    pLpInput_cuda = torch.cuda.FloatTensor(*dims).fill_(0.)
    pLpg_cuda = torch.cuda.FloatTensor(*norm_shape).fill_(0.)
    if HALF:
        pLpInput_cuda = pLpInput_cuda.half()
        pLpg_cuda = pLpg_cuda.half()

    torch.cuda.nvtx.range_push("kernel weight norm backward")
    apex._C.weight_norm_bwd(pLpInput_cuda,
                            pLpg_cuda,
                            pLpOutput_fp16,
                            pt_input_fp16,
                            pt_g_fp16,
                            pt_norms_control.data,
                            dim)
    torch.cuda.nvtx.range_pop()

    print("grad_output: ", pLpOutput_fp16.data)
    print(" grad_input: ", pLpInput_cuda)
    print(" savedInput: ", pt_input_fp16.data)
    print("pt_norms_control: ", pt_norms_control.data)
    print("pt_norms_fp16: ", pt_norms_fp16.data)

    torch.cuda.nvtx.range_push("pytorch fp16 backward")
    pt_output_fp16.backward(gradient=pLpOutput_fp16, create_graph=True)
    torch.cuda.nvtx.range_pop()
    torch.cuda.nvtx.range_push("pytorch fp32 backward")
    pt_output_control.backward(gradient=pLpOutput_control, create_graph=True)
    torch.cuda.nvtx.range_pop()
    # pt_output_fp16 and pt_output_control are still saved, but
    # pt_output_fp16.grad and pt_output_control.grad are None at this point
    # because the graph is freed in the backward pass.
    # Specifying create_graph/retain_graph doesn't seem to force saving of
    # either the intermediate variables or their gradients.
print("Comparing gradients wrt v")
torch.cuda.nvtx.range_push("compare pLpv")
compare(pLpInput_cuda, pt_input_fp16.grad.data, pt_input_control.grad.data, rows)
torch.cuda.nvtx.range_pop()
print("Comparing gradients wrt g")
torch.cuda.nvtx.range_push("compare pLpg")
compare(pLpg_cuda, pt_g_fp16.grad.data, pt_g_control.grad.data, pLpg_cuda.size(0))
torch.cuda.nvtx.range_pop()
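
# ---------------------------------------------------------------------------
# Side note (not part of the original file): a pure-PyTorch sketch of the
# analytic weight-norm gradients that a fused backward is expected to
# reproduce, under the definition w = g * v / ||v|| with the norm taken over
# all dimensions except `dim`. The function and variable names here are
# illustrative assumptions, not part of the apex API.
def reference_weight_norm_bwd(grad_output, v, g, dim):
    norms = pt_norm(v, dim)
    # Per-slice dot product of grad_output and v, reshaped to broadcast like g.
    prod = grad_output * v
    if dim == 0:
        dot = prod.contiguous().view(v.size(0), -1).sum(1).view(*get_norm_shape(v, dim))
    else:  # only dim == v.dim() - 1 is handled here, matching get_norm_shape
        dot = prod.contiguous().view(-1, v.size(-1)).sum(0).view(*get_norm_shape(v, dim))
    grad_g = dot / norms
    grad_v = (g / norms) * grad_output - (g * dot / (norms * norms * norms)) * v
    return grad_v, grad_g

# Quick self-check of the formulas against autograd on a small fp32 problem
# (last-dim case); the shapes are illustrative only.
v_chk = Variable(torch.cuda.FloatTensor(2, 3, 8).normal_(), requires_grad=True)
g_chk = Variable(torch.cuda.FloatTensor(*get_norm_shape(v_chk, 2)).normal_(), requires_grad=True)
go_chk = Variable(torch.cuda.FloatTensor(2, 3, 8).normal_())
(v_chk * (g_chk / pt_norm(v_chk, 2))).backward(gradient=go_chk)
ref_v, ref_g = reference_weight_norm_bwd(go_chk.data, v_chk.data, g_chk.data, 2)
print("max |dL/dv - reference| =", (v_chk.grad.data - ref_v).abs().max())
print("max |dL/dg - reference| =", (g_chk.grad.data - ref_g).abs().max())
# ---------------------------------------------------------------------------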
import torch
import sys
import apex._C
import numpy as np
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
sizes = [
    # (3, 512, 1024),
    # (3, 512, 1536),
    # (3, 768, 1536),
    # (3, 768, 2048),
    # (3, 1024, 2048),
    # (1, 1024, 4096),
    # (1, 2048, 8192),
    # (1, 4096, 4096), # this is not one of natalia's sizes, just a reference benchmark.
    (4096, 4096, 1), # this is not one of natalia's sizes, just a reference benchmark.
    # (353, 55, 353), # this is not one of natalia's sizes, just a reference benchmark.
]
# rows = 3
# cols = 512
# fast = 1024
HALF = True
RAND = True
dim = 0
for rows, cols, fast in sizes:
    dims = rows, cols, fast
    print("\n\nTESTING dims = {}\n\n".format(dims))
    if RAND:
        pt_in = 1.*torch.cuda.FloatTensor(*dims).uniform_()
        g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).uniform_()
    else:
        pt_in = torch.cuda.FloatTensor(*dims).fill_(1.)
        g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(6.0)
    # per_col = torch.arange(1,cols+1).cuda()
    # print((rows*per_col*per_col).sqrt())
    # pt_in *= per_col

    cuda_out = torch.cuda.FloatTensor(*dims).fill_(0.)
    cuda_norms = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(0.)

    # Save a copy of the input as float
    pt_in_fp32 = pt_in.clone()
    g_fp32 = g.clone()

    if HALF:
        pt_in = pt_in.half()
        g = g.half()
        cuda_out = cuda_out.half()

    apex._C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
    torch.cuda.synchronize()
    # quit()

    print("type(cuda_out) = {}\n".format(type(cuda_out)))

    rownorms = pt_norm(pt_in, dim)
    rownorms_fp32 = pt_norm(pt_in_fp32, dim)
    print("rownorms_fp32:")
    print(rownorms_fp32)
    print("cuda_norms")
    print(cuda_norms)

    # rownorms is broadcast; torch.div(pt_in, rownorms) and pt_in/rownorms work the same way
    pt_out = pt_in*(g/rownorms)
    pt_out_control = pt_in_fp32*(g_fp32/rownorms_fp32)

    compare(cuda_out, pt_out, pt_out_control, rows)
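
    # -----------------------------------------------------------------------
    # Side note (not part of the original test): a small numeric check of the
    # kernel's norms against the fp32 control norms, complementing the prints
    # above. No tolerance is asserted; the value is just reported.
    norms_err = (cuda_norms.view(-1) - rownorms_fp32.view(-1)).abs().max()
    print("max |cuda_norms - rownorms_fp32| = {}".format(norms_err))
    # -----------------------------------------------------------------------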