Commit d506eff2 authored by Michael Carilli

Macros based on torch.__version__ to compile with 0.4 and 0.5

parent 61b452e8
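In brief: setup.py inspects torch.__version__ and, when building against 0.4.x, passes -DVERSION_LE_04 to both the host and device compilers; the C++/CUDA sources then #ifdef on that define to select between the 0.4 and master (0.5) ATen spellings. A minimal sketch of the Python side, condensed from the setup.py hunk below (torch is the only assumed dependency):

```
import torch

# Parse major/minor from torch.__version__ (e.g. "0.4.0" or "0.5.0a0+abc123").
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])

# On 0.4.x, define VERSION_LE_04 so the C++/CUDA sources can pick the older
# ATen names (AT_ASSERT, cuda::acc_type, the two-argument ReduceAdd).
version_le_04 = []
if TORCH_MAJOR == 0 and TORCH_MINOR == 4:
    version_le_04 = ['-DVERSION_LE_04']

# The flag goes to both compilers, as in the setup.py hunk below:
#   extra_compile_args={'cxx': ['-g'] + version_le_04,
#                       'nvcc': ['-O3'] + version_le_04 + gencodes}
```

Gating on a build-time define rather than a runtime check keeps the compiled extension consistent with whichever PyTorch headers it was built against.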
@@ -5,8 +5,8 @@ This repo is designed to hold PyTorch modules and utilities that are under activ
 # Requirements
 Python 3
-PyTorch 0.3 or newer
 CUDA 9
+PyTorch 0.4 or newer. We recommend to use the latest stable release, obtainable from https://pytorch.org/. We also test against the latest master branch, obtainable from https://github.com/pytorch/pytorch. If you have any problems building, please file an issue.
 # [Full Documentation](https://nvidia.github.io/apex)
@@ -23,7 +23,7 @@ import apex
 ```
 and optionally (if required for your use)
 ```
-import apex._C as apex_backend
+import apex_C as apex_backend
 ```
 # What's included
...
@@ -4,7 +4,7 @@ import warnings
 import torch
-from apex._C import scale_check_overflow
+from apex_C import scale_check_overflow
 class AmpHandle(object):
 def __init__(self, enable_caching=True):
...
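scale_check_overflow is the backend routine the amp handle uses to downscale FP32 master gradients while testing for inf/nan. A hedged usage sketch; the in-place semantics and the nonzero-on-overflow buffer convention are assumptions, not spelled out in this diff:

```
import torch
import apex_C  # the renamed backend extension from this commit

# Hypothetical FP32 master grads and a one-element overflow buffer.
master_grads = torch.randn(1024).cuda()
overflow_buf = torch.cuda.IntTensor([0])

loss_scale = 128.0
# Downscale the master grads by 1/loss_scale, recording any inf/nan.
apex_C.scale_check_overflow(master_grads, 1.0 / loss_scale, overflow_buf)

# Assumed convention: a nonzero buffer means an overflow was detected,
# so the optimizer step should be skipped and the loss scale reduced.
if overflow_buf.item() != 0:
    print("overflow detected; skipping step")
```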
 import torch
 from torch.autograd import Variable
 from torch.autograd.function import Function, once_differentiable
-import apex._C
+import apex_C
 def check_contig_cuda(tensors, names):
 for tensor, name in zip(tensors, names):
@@ -71,7 +71,7 @@ class Fused_Weight_Norm(Function):
 [output_size(0),1,1,...].
 """
-apex._C.weight_norm_fwd(output, norms, input, g, dim)
+apex_C.weight_norm_fwd(output, norms, input, g, dim)
 ctx.save_for_backward(input, g)
 # save_for_backward can only save input or output tensors,
@@ -102,7 +102,7 @@ class Fused_Weight_Norm(Function):
 grad_input = grad_output_contig.new(grad_output.size()).contiguous()
 grad_g = savedg.new(savedg.size()).contiguous()
-apex._C.weight_norm_bwd(grad_input,
+apex_C.weight_norm_bwd(grad_input,
 grad_g,
 grad_output_contig,
 savedInput,
...
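Fused_Weight_Norm is a torch.autograd.Function whose forward and backward call into the renamed apex_C kernels. A rough usage sketch; the import path and the trailing dim argument to apply are assumptions inferred from the weight_norm_fwd call above:

```
import torch
# The import path is an assumption; the class lives in the file diffed above.
from apex.fused_weight_norm import Fused_Weight_Norm

v = torch.randn(64, 128, device="cuda", requires_grad=True)  # direction
g = torch.randn(64, 1, device="cuda", requires_grad=True)    # magnitude
dim = 0  # normalize over every dim except 0, as in torch.nn.utils.weight_norm

# forward  -> apex_C.weight_norm_fwd(output, norms, input, g, dim)
# backward -> apex_C.weight_norm_bwd(...) with the saved input and g
w = Fused_Weight_Norm.apply(v, g, dim)
w.sum().backward()
```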
@@ -4,7 +4,7 @@
 // here, but I can't make nvcc play well with torch.h. For now, use a layer of indirection
 // and separate .cu implementation files.
-// If we want everything to be part of "apex._C", we need all the interface functions defined
+// If we want everything to be part of "apex_C", we need all the interface functions defined
 // in this file, or linker will complain about "multiple definitions of PyInit".
 // TODO: multiple modules?
@@ -54,15 +54,23 @@ void scale_check_overflow_cuda
 float scale,
 const at::Tensor& d_buf);
+#ifdef VERSION_LE_04
+#define VERSION_AGNOSTIC_CHECK AT_ASSERT
+#else
+#define VERSION_AGNOSTIC_CHECK AT_CHECK
+#endif
 void scale_check_overflow
 (at::Tensor grads,
 float scale,
 at::Tensor overflow_buf)
 {
-AT_CHECK(grads.type().is_cuda(), "x must be a CUDA tensor");
-AT_CHECK(overflow_buf.type().is_cuda(), "y must be a CUDA tensor");
+VERSION_AGNOSTIC_CHECK
+(grads.type().is_cuda(), "x must be a CUDA tensor");
+VERSION_AGNOSTIC_CHECK
+(overflow_buf.type().is_cuda(), "y must be a CUDA tensor");
 // Make sure we are downscaling the FP32 master grads
-AT_CHECK
+VERSION_AGNOSTIC_CHECK
 (grads.type().scalarType() == at::ScalarType::Float,
 "grads supplied to scale_check_overflow should be fp32 (master grads).")
 scale_check_overflow_cuda(grads, scale, overflow_buf);
...
@@ -13,6 +13,19 @@
 #define __SYNCWARP
 #endif
+#ifdef VERSION_LE_04
+#define USING_ACCSCALAR_T using accscalar_t = cuda::acc_type<cuda_scalar_t>;
+#else
+#define USING_ACCSCALAR_T using accscalar_t = acc_type<cuda_scalar_t, true>;
+#endif
+#ifdef VERSION_LE_04
+#define REDUCE_ADD ReduceAdd<accscalar_t, accscalar_t>()
+#else
+#define REDUCE_ADD ReduceAdd<accscalar_t>()
+#endif
 // Block size for weight_norm_*_first_dim_kernel.
 // Currently, kernels are non-persistent.
 // Dialing up the block size to, say 1024, can improve performance by
...
 #include <ATen/ATen.h>
-#include "ATen/AccumulateType.h"
+// #include "ATen/AccumulateType.h"
 #include "ATen/cuda/CUDATensorMethods.cuh"
 #include "ATen/cuda/CUDATypeConversion.cuh"
 #include <THC/THCTensorMathReduce.cuh>
...
#include "kernel_utils.cuh" #include "kernel_utils.cuh"
#include <ATen/ATen.h> #include <ATen/ATen.h>
#ifdef VERSION_LE_04
#include "ATen/cuda/AccumulateType.cuh"
#else
#include "ATen/AccumulateType.h" #include "ATen/AccumulateType.h"
#endif
#include "ATen/cuda/CUDATensorMethods.cuh" #include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh" #include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh> #include <THC/THCTensorMathReduce.cuh>
...@@ -40,7 +46,7 @@ __global__ void weight_norm_bwd_first_dim_kernel ...@@ -40,7 +46,7 @@ __global__ void weight_norm_bwd_first_dim_kernel
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
} }
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>()); reduce_block_into_lanes(s, thread_sum, 1, REDUCE_ADD);
accscalar_t result = s[0]; accscalar_t result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably // Could choose to save reciprocal of norm instead I suppose, but norms is probably
...@@ -99,7 +105,7 @@ __global__ void weight_norm_bwd_last_dim_kernel ...@@ -99,7 +105,7 @@ __global__ void weight_norm_bwd_last_dim_kernel
slower_dims_location += blockDim.y; slower_dims_location += blockDim.y;
} }
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>()); reduce_block_into_lanes(s, thread_sum, blockDim.x, REDUCE_ADD);
accscalar_t result = s[threadIdx.x]; accscalar_t result = s[threadIdx.x];
// Broadcast load; could use shared memory instead. // Broadcast load; could use shared memory instead.
...@@ -159,7 +165,7 @@ void weight_norm_bwd_cuda ...@@ -159,7 +165,7 @@ void weight_norm_bwd_cuda
[&] [&]
{ {
using cuda_scalar_t = cuda::type<scalar_t>; using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>; USING_ACCSCALAR_T
weight_norm_bwd_first_dim_kernel weight_norm_bwd_first_dim_kernel
<<<pLpw.size(0), <<<pLpw.size(0),
...@@ -192,7 +198,7 @@ void weight_norm_bwd_cuda ...@@ -192,7 +198,7 @@ void weight_norm_bwd_cuda
[&] [&]
{ {
using cuda_scalar_t = cuda::type<scalar_t>; using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>; USING_ACCSCALAR_T
weight_norm_bwd_last_dim_kernel weight_norm_bwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W, <<<(fast_dim_size+TILE_W-1)/TILE_W,
......
#include "kernel_utils.cuh" #include "kernel_utils.cuh"
#include <ATen/ATen.h> #include <ATen/ATen.h>
#ifdef VERSION_LE_04
#include "ATen/cuda/AccumulateType.cuh"
#else
#include "ATen/AccumulateType.h" #include "ATen/AccumulateType.h"
#endif
#include "ATen/cuda/CUDATensorMethods.cuh" #include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh" #include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh> #include <THC/THCTensorMathReduce.cuh>
...@@ -38,7 +44,7 @@ __global__ void weight_norm_fwd_first_dim_kernel ...@@ -38,7 +44,7 @@ __global__ void weight_norm_fwd_first_dim_kernel
thread_sum += val_f*val_f; // AccumOp, could do Kahan here thread_sum += val_f*val_f; // AccumOp, could do Kahan here
} }
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>()); reduce_block_into_lanes(s, thread_sum, 1, REDUCE_ADD);
accscalar_t result = s[0]; accscalar_t result = s[0];
result = sqrtf(result); result = sqrtf(result);
...@@ -92,7 +98,7 @@ __global__ void weight_norm_fwd_last_dim_kernel ...@@ -92,7 +98,7 @@ __global__ void weight_norm_fwd_last_dim_kernel
slower_dims_location += blockDim.y; slower_dims_location += blockDim.y;
} }
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>()); reduce_block_into_lanes(s, thread_sum, blockDim.x, REDUCE_ADD);
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later // Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0) if(threadIdx.y == 0)
...@@ -150,7 +156,7 @@ void weight_norm_fwd_cuda ...@@ -150,7 +156,7 @@ void weight_norm_fwd_cuda
[&] [&]
{ {
using cuda_scalar_t = cuda::type<scalar_t>; using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>; USING_ACCSCALAR_T
weight_norm_fwd_first_dim_kernel weight_norm_fwd_first_dim_kernel
<<<v.size(0), <<<v.size(0),
...@@ -181,7 +187,7 @@ void weight_norm_fwd_cuda ...@@ -181,7 +187,7 @@ void weight_norm_fwd_cuda
[&] [&]
{ {
using cuda_scalar_t = cuda::type<scalar_t>; using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>; USING_ACCSCALAR_T
// just trying this formatting out to see how it feels... // just trying this formatting out to see how it feels...
weight_norm_fwd_last_dim_kernel weight_norm_fwd_last_dim_kernel
......
@@ -14,6 +14,18 @@ if not torch.cuda.is_available():
 print("Warning: Torch did not find available GPUs on this system.\n",
 "If your intention is to cross-compile, this is not an error.")
+print("torch.__version__ = ", torch.__version__)
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
+    raise RuntimeError("APEx requires Pytorch 0.4 or newer.\n" +
+                       "The latest stable release can be obtained from https://pytorch.org/")
+version_le_04 = []
+if TORCH_MAJOR == 0 and TORCH_MINOR == 4:
+    version_le_04 = ['-DVERSION_LE_04']
 def find(path, regex_func, collect=False):
 collection = [] if collect else None
 for root, dirs, files in os.walk(path):
@@ -35,37 +47,37 @@ def get_cuda_version():
 CUDA_LIB = re.compile(', V[0-9]+\.[0-9]+\.[0-9]+').search(nvcc_output).group(0).split('V')[1]
 print("Found CUDA_LIB = ", CUDA_LIB)
-CUDA_MAJOR_VERSION = int(CUDA_LIB.split('.')[0])
-print("Found CUDA_MAJOR_VERSION = ", CUDA_MAJOR_VERSION)
-if CUDA_MAJOR_VERSION < 8:
+CUDA_MAJOR = int(CUDA_LIB.split('.')[0])
+print("Found CUDA_MAJOR = ", CUDA_MAJOR)
+if CUDA_MAJOR < 8:
 raise RuntimeError("APex requires CUDA 8.0 or newer")
-return CUDA_MAJOR_VERSION
+return CUDA_MAJOR
 if CUDA_HOME is not None:
 print("Found CUDA_HOME = ", CUDA_HOME)
-CUDA_MAJOR_VERSION = get_cuda_version()
+CUDA_MAJOR = get_cuda_version()
 gencodes = ['-gencode', 'arch=compute_52,code=sm_52',
 '-gencode', 'arch=compute_60,code=sm_60',
 '-gencode', 'arch=compute_61,code=sm_61',]
-if CUDA_MAJOR_VERSION > 8:
+if CUDA_MAJOR > 8:
 gencodes += ['-gencode', 'arch=compute_70,code=sm_70',
 '-gencode', 'arch=compute_70,code=compute_70',]
 ext_modules = []
 extension = CUDAExtension(
-'apex._C', [
+'apex_C', [
 'csrc/interface.cpp',
 'csrc/weight_norm_fwd_cuda.cu',
 'csrc/weight_norm_bwd_cuda.cu',
 'csrc/scale_cuda.cu',
 ],
-extra_compile_args={'cxx': ['-g'],
-'nvcc': ['-O3'] + gencodes})
+extra_compile_args={'cxx': ['-g'] + version_le_04,
+'nvcc': ['-O3'] + version_le_04 + gencodes})
 ext_modules.append(extension)
 else:
 raise RuntimeError("Could not find Cuda install directory")
...
@@ -3,9 +3,9 @@ import numpy as np
 def compare(cuda_out, pt_out, pt_out_control, rows):
-print( "Pytorch ops in fp16: ", pt_out )
-print( "Kernel result: ", cuda_out )
-print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)
+# print( "Pytorch ops in fp16: ", pt_out )
+# print( "Kernel result: ", cuda_out )
+# print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)
 # Make upconverted copies for error check against fp32 control
 cuda_out_fp32 = cuda_out.float()
@@ -22,21 +22,21 @@ def compare(cuda_out, pt_out, pt_out_control, rows):
 pt_maxdiffs, pt_maxdiff_locs = torch.max((pt_out_control - pt_out_fp32 ).abs(),1)
 print( "cuda_maxdiffs = ", cuda_maxdiffs )
-print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
+# print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
 print( "pt_maxdiffs = ", pt_maxdiffs )
-print( "pt_maxdiff_locs = ", pt_maxdiff_locs )
+# print( "pt_maxdiff_locs = ", pt_maxdiff_locs )
 row_indices = torch.LongTensor(np.arange(rows))
-print("cuda_out at cuda_maxdiff_locs in each row:")
-# bizarrely, this will work if you do it at the python prompt:
-# print(cuda_out[row_indices,cuda_maxdiff_locs])
-# ...but it only seems to work here if you wrap with numpy arrays:
-print( cuda_out[np.array(row_indices),np.array(cuda_maxdiff_locs)])
-print("pt_out_control at cuda_maxdiff_locs in each row:")
-print(pt_out_control[np.array(row_indices),np.array(cuda_maxdiff_locs)])
-
-print("pt_out at pt_maxdiff_locs in each row:" )
-print( pt_out[np.array(row_indices),np.array(pt_maxdiff_locs)])
-print("pt_out_control at pt_maxdiff_locs in each row:" )
-print(pt_out_control[np.array(row_indices),np.array(pt_maxdiff_locs)])
+# print("cuda_out at cuda_maxdiff_locs in each row:")
+# # bizarrely, this will work if you do it at the python prompt:
+# # print(cuda_out[row_indices,cuda_maxdiff_locs])
+# # ...but it only seems to work here if you wrap with numpy arrays:
+# print( cuda_out[np.array(row_indices),np.array(cuda_maxdiff_locs)])
+# print("pt_out_control at cuda_maxdiff_locs in each row:")
+# print(pt_out_control[np.array(row_indices),np.array(cuda_maxdiff_locs)])
+#
+# print("pt_out at pt_maxdiff_locs in each row:" )
+# print( pt_out[np.array(row_indices),np.array(pt_maxdiff_locs)])
+# print("pt_out_control at pt_maxdiff_locs in each row:" )
+# print(pt_out_control[np.array(row_indices),np.array(pt_maxdiff_locs)])
 import torch
 from torch.autograd import Variable
-import apex._C
+import apex_C
 import numpy as np
 from compare import compare
 from norm import pt_norm, get_norm_shape
@@ -88,7 +88,7 @@ for rows, cols, fast in sizes:
 pLpg_cuda = pLpg_cuda .half()
 torch.cuda.nvtx.range_push("kernel weight norm backward")
-apex._C.weight_norm_bwd(pLpInput_cuda,
+apex_C.weight_norm_bwd(pLpInput_cuda,
 pLpg_cuda,
 pLpOutput_fp16,
 pt_input_fp16,
...
 import torch
 import sys
-import apex._C
+import apex_C
 import numpy as np
 from compare import compare
 from norm import pt_norm, get_norm_shape
@@ -60,7 +60,7 @@ for rows, cols, fast in sizes:
 g = g.half()
 cuda_out = cuda_out.half()
-apex._C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
+apex_C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
 torch.cuda.synchronize()
 # quit()
...
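The tests above drive the renamed backend directly and compare against plain PyTorch ops. A condensed sketch of the forward-test pattern, with assumed shapes (the fused kernel writes the normalized output and the per-row norms into preallocated tensors):

```
import torch
import apex_C

rows, cols, dim = 1024, 512, 0
pt_in = torch.randn(rows, cols, device="cuda")
g = torch.randn(rows, 1, device="cuda")

# Preallocated outputs; the (rows, 1) norm shape is an assumption for dim=0.
cuda_out = torch.empty_like(pt_in)
cuda_norms = torch.empty(rows, 1, device="cuda")

apex_C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
torch.cuda.synchronize()

# fp32 PyTorch reference, as in compare.py's control path.
control = g * pt_in / pt_in.norm(2, 1, keepdim=True)
print("max abs diff:", (cuda_out - control).abs().max())
```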