Commit 2fa4dbaf authored by Christian Sarofeen

Initial release
import torch
import numpy as np
def compare(cuda_out, pt_out, pt_out_control, rows):
print( "Pytorch ops in fp16: ", pt_out )
print( "Kernel result: ", cuda_out )
print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)
# Make upconverted copies for error check against fp32 control
cuda_out_fp32 = cuda_out.float()
pt_out_fp32 = pt_out.float()
# Flatten all but the slowest dimension
cuda_out = cuda_out.view(rows,-1)
pt_out = pt_out.view(rows,-1)
cuda_out_fp32 = cuda_out_fp32.view(rows,-1)
pt_out_fp32 = pt_out_fp32.view(rows,-1)
pt_out_control = pt_out_control.view(rows,-1)
cuda_maxdiffs, cuda_maxdiff_locs = torch.max((pt_out_control - cuda_out_fp32).abs(),1)
pt_maxdiffs, pt_maxdiff_locs = torch.max((pt_out_control - pt_out_fp32 ).abs(),1)
print( "cuda_maxdiffs = ", cuda_maxdiffs )
print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
print( "pt_maxdiffs = ", pt_maxdiffs )
print( "pt_maxdiff_locs = ", pt_maxdiff_locs )
row_indices = torch.LongTensor(np.arange(rows))
print("cuda_out at cuda_maxdiff_locs in each row:")
# bizarrely, this will work if you do it at the python prompt:
# print(cuda_out[row_indices,cuda_maxdiff_locs])
# ...but it only seems to work here if you wrap with numpy arrays:
print( cuda_out[np.array(row_indices),np.array(cuda_maxdiff_locs)])
print("pt_out_control at cuda_maxdiff_locs in each row:")
print(pt_out_control[np.array(row_indices),np.array(cuda_maxdiff_locs)])
print("pt_out at pt_maxdiff_locs in each row:" )
print( pt_out[np.array(row_indices),np.array(pt_maxdiff_locs)])
print("pt_out_control at pt_maxdiff_locs in each row:" )
print(pt_out_control[np.array(row_indices),np.array(pt_maxdiff_locs)])
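
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original file), assuming a CUDA
# device is available. The shapes and names below are illustrative only: a
# fused-kernel fp16 result and a plain-PyTorch fp16 result are compared
# against an fp32 control for a 2-row problem.
if __name__ == "__main__":
    control_fp32 = torch.cuda.FloatTensor(2, 4).normal_()
    kernel_result = control_fp32.half()   # stand-in for a fused-kernel output
    pytorch_fp16 = control_fp32.half()    # stand-in for plain fp16 PyTorch ops
    compare(kernel_result, pytorch_fp16, control_fp32, rows=2)
# ---------------------------------------------------------------------------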
import torch
def get_norm_shape(p, dim):
    if dim == 0:
        output_size = (p.size(0),) + (1,) * (p.dim() - 1)
        return output_size
    elif dim == p.dim() - 1:
        output_size = (1,) * (p.dim() - 1) + (p.size(-1),)
        return output_size
    return None


def pt_norm(p, dim):
    """Computes the norm over all dimensions except dim"""
    if dim is None:
        return p.norm()
    elif dim == 0:
        return p.contiguous().view(p.size(0), -1).norm(2, dim=1).view(*get_norm_shape(p, dim))
    elif dim == p.dim() - 1:
        return p.contiguous().view(-1, p.size(-1)).norm(2, dim=0).view(*get_norm_shape(p, dim))
    return pt_norm(p.transpose(0, dim), 0).transpose(0, dim)
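
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original file). For weight norm,
# w = g * v / ||v||, where the norm is taken over every dimension except
# `dim`; pt_norm returns one norm per slice along `dim`, reshaped by
# get_norm_shape so it broadcasts against v. The shape (3, 5, 7) below is an
# illustrative assumption.
if __name__ == "__main__":
    v = torch.randn(3, 5, 7)
    for d in (0, v.dim() - 1):
        norms = pt_norm(v, d)
        print("dim =", d, "norm shape =", norms.size(), "expected =", get_norm_shape(v, d))
# ---------------------------------------------------------------------------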
import torch
from torch.autograd import Variable
from apex.fp16_utils import Fused_Weight_Norm
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
rows = 1 # 321
cols = 4096 # 33
fast = 4096 # 185
dims = rows, cols, fast
dim = 2
CUDA_HALF = False
RAND = True # If false, input gradients (the result of the backward pass)
# should be analytically zero.
# Loss will be computed via (output*elementwise).sum().
# This means that output gradients in the backward pass will be equal
# to elementwise, so by manipulating elementwise, we have easy
# fine-grained control over the output gradients we'd like to use for
# testing purposes.
#
# The alternative is just to create the output_gradients manually
# and call output.backward(gradient=output_gradients),
# as is done in test_backward.py.
# But I wanted a minimal working sample similar to an "actual" use case,
# where gradients are computed by calling backward() on a scalar Loss.
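
# ---------------------------------------------------------------------------
# Side note (not part of the original script): a minimal sketch checking the
# claim above, i.e. that for Loss = (output*elementwise).sum() the gradient
# arriving at `output` equals `elementwise`. The `_demo` names are
# illustrative only.
out_demo = Variable(torch.cuda.FloatTensor(2, 3).normal_(), requires_grad=True)
elem_demo = Variable(torch.cuda.FloatTensor(2, 3).normal_())
(out_demo * elem_demo).sum().backward()
# d/d(out_demo) of sum(out_demo * elem_demo) is elem_demo itself:
print("max |output grad - elementwise| =",
      (out_demo.grad.data - elem_demo.data).abs().max())
# ---------------------------------------------------------------------------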
if RAND:
    # With std=6.0, I observe the pytorch fp16 ops going unstable
    # while the fused kernel remains stable (sometimes).
    pt_in_fp32 = torch.cuda.FloatTensor(*dims).normal_(std=1.0)
    norm_shape = get_norm_shape(pt_in_fp32, dim)
    pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).normal_(std=1.0)
    elementwise_fp32 = torch.cuda.FloatTensor(*dims).normal_(std=1.0)
else:
    pt_in_fp32 = torch.cuda.FloatTensor(*dims).fill_(1.0)
    norm_shape = get_norm_shape(pt_in_fp32, dim)
    pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).fill_(2.0)
    elementwise_fp32 = torch.cuda.FloatTensor(*dims).fill_(0.5)
pt_in_fp16 = pt_in_fp32.half()
cd_in_prec = pt_in_fp32.clone()
pt_g_fp16 = pt_g_fp32.half()
cd_g_prec = pt_g_fp32.clone()
elementwise_fp16 = elementwise_fp32.half()
elementwise_prec = elementwise_fp32.clone()
if CUDA_HALF:
    cd_in_prec = cd_in_prec.half()
    cd_g_prec = cd_g_prec.half()
    elementwise_prec = elementwise_prec.half()
pt_in_fp32 = Variable(pt_in_fp32 , requires_grad=True)
pt_in_fp16 = Variable(pt_in_fp16 , requires_grad=True)
cd_in_prec = Variable(cd_in_prec , requires_grad=True)
pt_g_fp32 = Variable(pt_g_fp32 , requires_grad=True)
pt_g_fp16 = Variable(pt_g_fp16 , requires_grad=True)
cd_g_prec = Variable(cd_g_prec , requires_grad=True)
elementwise_fp32 = Variable(elementwise_fp32, requires_grad=False)
elementwise_fp16 = Variable(elementwise_fp16, requires_grad=False)
elementwise_prec = Variable(elementwise_prec, requires_grad=False)
torch.cuda.nvtx.range_push("fp16 forward, {}".format(pt_in_fp16.size()))
pt_norms_fp16 = pt_norm(pt_in_fp16, dim)
pt_out_fp16 = pt_in_fp16*(pt_g_fp16/pt_norms_fp16)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
torch.cuda.nvtx.range_push("fp32 forward, {}".format(pt_in_fp32.size()))
pt_norms_fp32 = pt_norm(pt_in_fp32, dim)
pt_out_fp32 = pt_in_fp32*(pt_g_fp32/pt_norms_fp32)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
# print("pt_norms_fp16 = ", pt_norms_fp16 )
# print("pt_norms_fp32 = ", pt_norms_fp32)
# print( "cd_in_prec.data_ptr = {:x}".format(cd_in_prec.data_ptr()))
# print("elementwise_fp16 = ", elementwise_fp16)
cd_in_contig = cd_in_prec.contiguous()
# Deliberately make noncontig to see if fused_norm
# will handle the error
# cd_in_contig = cd_in_contig[:,0:5]
# print(type(cd_in_contig))
torch.cuda.nvtx.range_push("kernel forward")
fused_weight_norm = Fused_Weight_Norm.apply
cd_out_prec = fused_weight_norm(cd_in_contig, cd_g_prec, dim)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()
# print("type(cd_out_prec.data) = ", type(cd_out_prec.data))
# print("cd_out_prec.data_ptr = {:x}".format(cd_out_prec.data_ptr()))
print("\n\n\nCOMPARING FORWARD PASS RESULTS\n\n\n")
compare(cd_out_prec.data,
        pt_out_fp16.data,
        pt_out_fp32.data,
        rows)
# It's ok to use elementwise_fp16 as a leaf in both the cuda and pytorch graphs.
# This sharing should not affect the computed gradients wrt pt_in_fp16 and cd_in_prec.
# However, just remember:
# If we set requires_grad=True for elementwise_fp16, elementwise_fp16.grad.data
# will accumulate gradients during the backward passes for both the cd and pytorch Losses.
#
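
# ---------------------------------------------------------------------------
# Side note (not part of the original script): a minimal sketch of the
# accumulation behaviour described above. When a leaf with requires_grad=True
# feeds two separate losses, its .grad accumulates across both backward
# passes. The `_demo` names are illustrative only.
shared_demo = Variable(torch.cuda.FloatTensor(4).fill_(1.0), requires_grad=True)
(shared_demo * 2.0).sum().backward()
(shared_demo * 3.0).sum().backward()
# After the two backward passes, every element of .grad holds 2 + 3 = 5.
print("accumulated grad on shared leaf:", shared_demo.grad.data)
# ---------------------------------------------------------------------------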
# The parentheses below are required: without them, .sum() would bind only to
# the elementwise tensor instead of to the product.
Loss_cd_prec = (cd_out_prec*elementwise_prec).sum()
# print(Loss_cd_prec)
Loss_pt_fp16 = (pt_out_fp16*elementwise_fp16).sum()
# print(Loss_pt_fp16)
Loss_pt_fp32 = (pt_out_fp32*elementwise_fp32).sum()
# print(Loss_pt_fp32)
torch.cuda.nvtx.range_push("kernel backward")
Loss_cd_prec.backward()
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("fp16 backward")
Loss_pt_fp16.backward()
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("fp32 backward")
Loss_pt_fp32.backward()
torch.cuda.nvtx.range_pop()
print("\n\n\nCOMPARING v GRADIENT RESULTS\n\n\n")
compare(cd_in_prec.grad.data,
        pt_in_fp16.grad.data,
        pt_in_fp32.grad.data,
        rows)
print("\n\n\nCOMPARING g GRADIENT RESULTS\n\n\n")
compare(cd_g_prec.grad.data,
        pt_g_fp16.grad.data,
        pt_g_fp32.grad.data,
        cd_g_prec.size(0))
import torch
from torch.autograd import Variable
import apex._C
import numpy as np
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
sizes = [
    # (3, 512, 1024),
    # (3, 512, 1536),
    (3, 768, 1536),
    # (3, 768, 2048),
    # (3, 1024, 2048),
    # (1, 1024, 4096),
    # (1, 2048, 8192),
    # (1, 4096, 4096), # this is not one of natalia's sizes, just a reference benchmark.
    # (4096, 4096, 1), # this is not one of natalia's sizes, just a reference benchmark.
]
# rows = 3
# cols = 512
# fast = 1024
HALF = True
RAND = True
dim = 2
for rows, cols, fast in sizes:
    dims = rows, cols, fast
    # Incoming gradient vectors we will use later.
    # The fp16 copies need to be created from plain Tensors with .half() before wrapping in
    # Variables: if pt_input_control were already a Variable and we wrote
    # pt_input_fp16 = pt_input_control.half(), pt_input_fp16 would accidentally become part of
    # pt_input_control's computational graph instead of being the leaf of its own separate graph.
    # Careful: if you initialize with torch.ones, the gradient wrt input becomes analytically zero :P
    if RAND:
        pLpOutput_control = torch.cuda.FloatTensor(*dims).uniform_()*1.0
        norm_shape = get_norm_shape(pLpOutput_control, dim)
        pLpg_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
        pt_input_control = torch.cuda.FloatTensor(*dims).uniform_()
        pt_g_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
    else:
        pLpOutput_control = torch.cuda.FloatTensor(*dims).fill_(1.)
        norm_shape = get_norm_shape(pLpOutput_control, dim)
        pLpg_control = torch.cuda.FloatTensor(*norm_shape).fill_(2.)
        pt_input_control = torch.cuda.FloatTensor(*dims).fill_(4.0)
        pt_g_control = torch.cuda.FloatTensor(*norm_shape).fill_(3.0)

    pLpOutput_fp16 = pLpOutput_control.clone()
    pLpg_fp16 = pLpg_control.clone()
    pt_input_fp16 = pt_input_control.clone()
    pt_g_fp16 = pt_g_control.clone()

    if HALF:
        pLpOutput_fp16 = pLpOutput_fp16.half()
        pLpg_fp16 = pLpg_fp16.half()
        pt_input_fp16 = pt_input_fp16.half()
        pt_g_fp16 = pt_g_fp16.half()

    pLpOutput_control = Variable(pLpOutput_control)
    pLpg_control = Variable(pLpg_control)
    pLpOutput_fp16 = Variable(pLpOutput_fp16)
    pLpg_fp16 = Variable(pLpg_fp16)
    pt_input_control = Variable(pt_input_control, requires_grad=True)
    pt_g_control = Variable(pt_g_control, requires_grad=True)
    pt_input_fp16 = Variable(pt_input_fp16, requires_grad=True)
    pt_g_fp16 = Variable(pt_g_fp16, requires_grad=True)

    # Do forward pass in fp16 and fp32
    pt_norms_fp16 = pt_norm(pt_input_fp16, dim)
    pt_norms_control = pt_norm(pt_input_control, dim)
    pt_output_fp16 = pt_input_fp16*(pt_g_fp16/pt_norms_fp16)
    pt_output_control = pt_input_control*(pt_g_control/pt_norms_control)

    # Run the Cuda version
    pLpInput_cuda = torch.cuda.FloatTensor(*dims).fill_(0.)
    pLpg_cuda = torch.cuda.FloatTensor(*norm_shape).fill_(0.)
    if HALF:
        pLpInput_cuda = pLpInput_cuda.half()
        pLpg_cuda = pLpg_cuda.half()

    torch.cuda.nvtx.range_push("kernel weight norm backward")
    apex._C.weight_norm_bwd(pLpInput_cuda,
                            pLpg_cuda,
                            pLpOutput_fp16,
                            pt_input_fp16,
                            pt_g_fp16,
                            pt_norms_control.data,
                            dim)
    torch.cuda.nvtx.range_pop()

    print("grad_output: ", pLpOutput_fp16.data)
    print(" grad_input: ", pLpInput_cuda)
    print(" savedInput: ", pt_input_fp16.data)
    print("pt_norms_control: ", pt_norms_control.data)
    print("pt_norms_fp16: ", pt_norms_fp16.data)

    torch.cuda.nvtx.range_push("pytorch fp16 backward")
    pt_output_fp16.backward(gradient=pLpOutput_fp16, create_graph=True)
    torch.cuda.nvtx.range_pop()
    torch.cuda.nvtx.range_push("pytorch fp32 backward")
    pt_output_control.backward(gradient=pLpOutput_control, create_graph=True)
    torch.cuda.nvtx.range_pop()
    # pt_output_fp16 and pt_output_control are still saved, but
    # pt_output_fp16.grad and pt_output_control.grad are None at this point
    # because the graph is freed in the backward pass.
    # Specifying create_graph/retain_graph doesn't seem to force saving of
    # either the intermediate variables or their gradients.
print("Comparing gradients wrt v")
torch.cuda.nvtx.range_push("compare pLpv")
compare(pLpInput_cuda, pt_input_fp16.grad.data, pt_input_control.grad.data, rows)
torch.cuda.nvtx.range_pop()
print("Comparing gradients wrt g")
torch.cuda.nvtx.range_push("compare pLpg")
compare(pLpg_cuda, pt_g_fp16.grad.data, pt_g_control.grad.data, pLpg_cuda.size(0))
torch.cuda.nvtx.range_pop()
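
# ---------------------------------------------------------------------------
# Side note (not part of the original file): a pure-PyTorch sketch of the
# analytic weight-norm gradients that a fused backward is expected to
# reproduce, under the definition w = g * v / ||v|| with the norm taken over
# all dimensions except `dim`. The function and variable names here are
# illustrative assumptions, not part of the apex API.
def reference_weight_norm_bwd(grad_output, v, g, dim):
    norms = pt_norm(v, dim)
    # Per-slice dot product of grad_output and v, reshaped to broadcast like g.
    prod = grad_output * v
    if dim == 0:
        dot = prod.contiguous().view(v.size(0), -1).sum(1).view(*get_norm_shape(v, dim))
    else:  # only dim == v.dim() - 1 is handled here, matching get_norm_shape
        dot = prod.contiguous().view(-1, v.size(-1)).sum(0).view(*get_norm_shape(v, dim))
    grad_g = dot / norms
    grad_v = (g / norms) * grad_output - (g * dot / (norms * norms * norms)) * v
    return grad_v, grad_g

# Quick self-check of the formulas against autograd on a small fp32 problem
# (last-dim case); the shapes are illustrative only.
v_chk = Variable(torch.cuda.FloatTensor(2, 3, 8).normal_(), requires_grad=True)
g_chk = Variable(torch.cuda.FloatTensor(*get_norm_shape(v_chk, 2)).normal_(), requires_grad=True)
go_chk = Variable(torch.cuda.FloatTensor(2, 3, 8).normal_())
(v_chk * (g_chk / pt_norm(v_chk, 2))).backward(gradient=go_chk)
ref_v, ref_g = reference_weight_norm_bwd(go_chk.data, v_chk.data, g_chk.data, 2)
print("max |dL/dv - reference| =", (v_chk.grad.data - ref_v).abs().max())
print("max |dL/dg - reference| =", (g_chk.grad.data - ref_g).abs().max())
# ---------------------------------------------------------------------------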
import torch
import sys
import apex._C
import numpy as np
from compare import compare
from norm import pt_norm, get_norm_shape
torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)
sizes = [
    # (3, 512, 1024),
    # (3, 512, 1536),
    # (3, 768, 1536),
    # (3, 768, 2048),
    # (3, 1024, 2048),
    # (1, 1024, 4096),
    # (1, 2048, 8192),
    # (1, 4096, 4096), # this is not one of natalia's sizes, just a reference benchmark.
    (4096, 4096, 1), # this is not one of natalia's sizes, just a reference benchmark.
    # (353, 55, 353), # this is not one of natalia's sizes, just a reference benchmark.
]
# rows = 3
# cols = 512
# fast = 1024
HALF = True
RAND = True
dim = 0
for rows, cols, fast in sizes:
    dims = rows, cols, fast
    print("\n\nTESTING dims = {}\n\n".format(dims))
    if RAND:
        pt_in = 1.*torch.cuda.FloatTensor(*dims).uniform_()
        g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).uniform_()
    else:
        pt_in = torch.cuda.FloatTensor(*dims).fill_(1.)
        g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(6.0)
    # per_col = torch.arange(1,cols+1).cuda()
    # print((rows*per_col*per_col).sqrt())
    # pt_in *= per_col

    cuda_out = torch.cuda.FloatTensor(*dims).fill_(0.)
    cuda_norms = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(0.)

    # Save a copy of the input as float
    pt_in_fp32 = pt_in.clone()
    g_fp32 = g.clone()

    if HALF:
        pt_in = pt_in.half()
        g = g.half()
        cuda_out = cuda_out.half()

    apex._C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
    torch.cuda.synchronize()
    # quit()

    print("type(cuda_out) = {}\n".format(type(cuda_out)))

    rownorms = pt_norm(pt_in, dim)
    rownorms_fp32 = pt_norm(pt_in_fp32, dim)
    print("rownorms_fp32:")
    print(rownorms_fp32)
    print("cuda_norms")
    print(cuda_norms)

    # rownorms is broadcast; torch.div(pt_in, rownorms) and pt_in/rownorms work the same way
    pt_out = pt_in*(g/rownorms)
    pt_out_control = pt_in_fp32*(g_fp32/rownorms_fp32)

    compare(cuda_out, pt_out, pt_out_control, rows)
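
    # -----------------------------------------------------------------------
    # Side note (not part of the original test): a small numeric check of the
    # kernel's norms against the fp32 control norms, complementing the prints
    # above. No tolerance is asserted; the value is just reported.
    norms_err = (cuda_norms.view(-1) - rownorms_fp32.view(-1)).abs().max()
    print("max |cuda_norms - rownorms_fp32| = {}".format(norms_err))
    # -----------------------------------------------------------------------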