import torch


def compare(cuda_out, pt_out, pt_out_control, rows):
    """Compare a custom-kernel fp16 result and a stock-PyTorch fp16 result
    against an fp32 control, printing each one's per-row max absolute error."""
    # print("Pytorch ops in fp16: ", pt_out)
    # print("Kernel result: ", cuda_out)
    # print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)

    # Make upconverted copies for the error check against the fp32 control
    cuda_out_fp32 = cuda_out.float()
    pt_out_fp32 = pt_out.float()

    # Flatten all but the slowest dimension
    cuda_out = cuda_out.view(rows, -1)
    pt_out = pt_out.view(rows, -1)
    cuda_out_fp32 = cuda_out_fp32.view(rows, -1)
    pt_out_fp32 = pt_out_fp32.view(rows, -1)
    pt_out_control = pt_out_control.view(rows, -1)

    # Per-row max absolute error (and its column index) vs. the fp32 control
    cuda_maxdiffs, cuda_maxdiff_locs = torch.max((pt_out_control - cuda_out_fp32).abs(), 1)
    pt_maxdiffs, pt_maxdiff_locs = torch.max((pt_out_control - pt_out_fp32).abs(), 1)

    print("cuda_maxdiffs = ", cuda_maxdiffs)
    # print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
    print("pt_maxdiffs = ", pt_maxdiffs)
    # print("pt_maxdiff_locs = ", pt_maxdiff_locs)

    # Row indices for advanced indexing, kept on the same device as the data.
    # (Older PyTorch required wrapping the index tensors in np.array() when
    # indexing outside the interactive prompt; modern advanced indexing with
    # LongTensors works directly.)
    row_indices = torch.arange(rows, device=cuda_out.device)
    # print("cuda_out at cuda_maxdiff_locs in each row:")
    # print(cuda_out[row_indices, cuda_maxdiff_locs])
    # print("pt_out_control at cuda_maxdiff_locs in each row:")
    # print(pt_out_control[row_indices, cuda_maxdiff_locs])
    #
    # print("pt_out at pt_maxdiff_locs in each row:")
    # print(pt_out[row_indices, pt_maxdiff_locs])
    # print("pt_out_control at pt_maxdiff_locs in each row:")
    # print(pt_out_control[row_indices, pt_maxdiff_locs])
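

# ---------------------------------------------------------------------------
# Usage sketch (not from the original file): a minimal example of driving
# compare(). It fakes the "custom CUDA kernel" output by perturbing the stock
# fp16 result, so both error columns print nonzero, slightly different values.
# The row-wise softmax, the shapes, and the noise scale below are illustrative
# assumptions only. Requires a CUDA device.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    torch.manual_seed(0)
    rows, cols = 4, 1024
    x = torch.randn(rows, cols, device="cuda")

    pt_out_control = torch.softmax(x, dim=1)   # fp32 control
    pt_out = torch.softmax(x.half(), dim=1)    # same op via PyTorch fp16

    # Stand-in for a custom kernel: the fp16 result plus a little extra noise,
    # mimicking a kernel with slightly different rounding behavior.
    cuda_out = (pt_out.float() + 1e-4 * torch.randn_like(x)).half()

    compare(cuda_out, pt_out, pt_out_control, rows)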